Пример #1
0
 def __init__(self,
              path_or_stream,
              dest_dir=None,
              log=None,
              notes_text=None):
     self.docx = DOCX(path_or_stream, log=log)
     self.log = self.docx.log
     self.notes_text = notes_text or _('Notes')
     self.dest_dir = dest_dir or os.getcwdu()
     self.mi = self.docx.metadata
     self.body = BODY()
     self.styles = Styles()
     self.images = Images()
     self.tables = Tables()
     self.object_map = OrderedDict()
     self.html = HTML(
         HEAD(
             META(charset='utf-8'),
             TITLE(self.mi.title or _('Unknown')),
             LINK(rel='stylesheet', type='text/css', href='docx.css'),
         ), self.body)
     self.html.text = '\n\t'
     self.html[0].text = '\n\t\t'
     self.html[0].tail = '\n'
     for child in self.html[0]:
         child.tail = '\n\t\t'
     self.html[0][-1].tail = '\n\t'
     self.html[1].text = self.html[1].tail = '\n'
     lang = canonicalize_lang(self.mi.language)
     if lang and lang != 'und':
         lang = lang_as_iso639_1(lang)
         if lang:
             self.html.set('lang', lang)
Пример #2
0
 def __init__(self, path_or_stream, dest_dir=None, log=None, notes_text=None):
     self.docx = DOCX(path_or_stream, log=log)
     self.log = self.docx.log
     self.notes_text = notes_text or _("Notes")
     self.dest_dir = dest_dir or os.getcwdu()
     self.mi = self.docx.metadata
     self.body = BODY()
     self.styles = Styles()
     self.images = Images()
     self.tables = Tables()
     self.object_map = OrderedDict()
     self.html = HTML(
         HEAD(
             META(charset="utf-8"),
             TITLE(self.mi.title or _("Unknown")),
             LINK(rel="stylesheet", type="text/css", href="docx.css"),
         ),
         self.body,
     )
     self.html.text = "\n\t"
     self.html[0].text = "\n\t\t"
     self.html[0].tail = "\n"
     for child in self.html[0]:
         child.tail = "\n\t\t"
     self.html[0][-1].tail = "\n\t"
     self.html[1].text = self.html[1].tail = "\n"
     lang = canonicalize_lang(self.mi.language)
     if lang and lang != "und":
         lang = lang_as_iso639_1(lang)
         if lang:
             self.html.set("lang", lang)
Пример #3
0
 def __init__(self, path_or_stream, dest_dir=None, log=None, notes_text=None):
     self.docx = DOCX(path_or_stream, log=log)
     self.ms_pat = re.compile(r'\s{2,}')
     self.ws_pat = re.compile(r'[\n\r\t]')
     self.log = self.docx.log
     self.notes_text = notes_text or _('Notes')
     self.dest_dir = dest_dir or os.getcwdu()
     self.mi = self.docx.metadata
     self.body = BODY()
     self.tables = Tables()
     self.styles = Styles(self.tables)
     self.images = Images()
     self.object_map = OrderedDict()
     self.html = HTML(
         HEAD(
             META(charset='utf-8'),
             TITLE(self.mi.title or _('Unknown')),
             LINK(rel='stylesheet', type='text/css', href='docx.css'),
         ),
         self.body
     )
     self.html.text='\n\t'
     self.html[0].text='\n\t\t'
     self.html[0].tail='\n'
     for child in self.html[0]:
         child.tail = '\n\t\t'
     self.html[0][-1].tail = '\n\t'
     self.html[1].text = self.html[1].tail = '\n'
     lang = canonicalize_lang(self.mi.language)
     if lang and lang != 'und':
         lang = lang_as_iso639_1(lang)
         if lang:
             self.html.set('lang', lang)
Пример #4
0
 def __init__(self,
              path_or_stream,
              dest_dir=None,
              log=None,
              detect_cover=True,
              notes_text=None,
              notes_nopb=False,
              nosupsub=False):
     self.docx = DOCX(path_or_stream, log=log)
     self.namespace = self.docx.namespace
     self.ms_pat = re.compile(r'\s{2,}')
     self.ws_pat = re.compile(r'[\n\r\t]')
     self.log = self.docx.log
     self.detect_cover = detect_cover
     self.notes_text = notes_text or _('Notes')
     self.notes_nopb = notes_nopb
     self.nosupsub = nosupsub
     self.dest_dir = dest_dir or os.getcwdu()
     self.mi = self.docx.metadata
     self.body = BODY()
     self.theme = Theme(self.namespace)
     self.settings = Settings(self.namespace)
     self.tables = Tables(self.namespace)
     self.fields = Fields(self.namespace)
     self.styles = Styles(self.namespace, self.tables)
     self.images = Images(self.namespace, self.log)
     self.object_map = OrderedDict()
     self.html = HTML(
         HEAD(
             META(charset='utf-8'),
             TITLE(self.mi.title or _('Unknown')),
             LINK(rel='stylesheet', type='text/css', href='docx.css'),
         ), self.body)
     self.html.text = '\n\t'
     self.html[0].text = '\n\t\t'
     self.html[0].tail = '\n'
     for child in self.html[0]:
         child.tail = '\n\t\t'
     self.html[0][-1].tail = '\n\t'
     self.html[1].text = self.html[1].tail = '\n'
     lang = html_lang(self.mi.language)
     if lang:
         self.html.set('lang', lang)
         self.doc_lang = lang
     else:
         self.doc_lang = None
Пример #5
0
 def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None, notes_nopb=False, nosupsub=False):
     self.docx = DOCX(path_or_stream, log=log)
     self.namespace = self.docx.namespace
     self.ms_pat = re.compile(r'\s{2,}')
     self.ws_pat = re.compile(r'[\n\r\t]')
     self.log = self.docx.log
     self.detect_cover = detect_cover
     self.notes_text = notes_text or _('Notes')
     self.notes_nopb = notes_nopb
     self.nosupsub = nosupsub
     self.dest_dir = dest_dir or os.getcwdu()
     self.mi = self.docx.metadata
     self.body = BODY()
     self.theme = Theme(self.namespace)
     self.settings = Settings(self.namespace)
     self.tables = Tables(self.namespace)
     self.fields = Fields(self.namespace)
     self.styles = Styles(self.namespace, self.tables)
     self.images = Images(self.namespace, self.log)
     self.object_map = OrderedDict()
     self.html = HTML(
         HEAD(
             META(charset='utf-8'),
             TITLE(self.mi.title or _('Unknown')),
             LINK(rel='stylesheet', type='text/css', href='docx.css'),
         ),
         self.body
     )
     self.html.text='\n\t'
     self.html[0].text='\n\t\t'
     self.html[0].tail='\n'
     for child in self.html[0]:
         child.tail = '\n\t\t'
     self.html[0][-1].tail = '\n\t'
     self.html[1].text = self.html[1].tail = '\n'
     lang = html_lang(self.mi.language)
     if lang:
         self.html.set('lang', lang)
         self.doc_lang = lang
     else:
         self.doc_lang = None
Пример #6
0
class Convert(object):

    def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None, notes_nopb=False, nosupsub=False):
        self.docx = DOCX(path_or_stream, log=log)
        self.namespace = self.docx.namespace
        self.ms_pat = re.compile(r'\s{2,}')
        self.ws_pat = re.compile(r'[\n\r\t]')
        self.log = self.docx.log
        self.detect_cover = detect_cover
        self.notes_text = notes_text or _('Notes')
        self.notes_nopb = notes_nopb
        self.nosupsub = nosupsub
        self.dest_dir = dest_dir or getcwd()
        self.mi = self.docx.metadata
        self.body = BODY()
        self.theme = Theme(self.namespace)
        self.settings = Settings(self.namespace)
        self.tables = Tables(self.namespace)
        self.fields = Fields(self.namespace)
        self.styles = Styles(self.namespace, self.tables)
        self.images = Images(self.namespace, self.log)
        self.object_map = OrderedDict()
        self.html = HTML(
            HEAD(
                META(charset='utf-8'),
                TITLE(self.mi.title or _('Unknown')),
                LINK(rel='stylesheet', type='text/css', href='docx.css'),
            ),
            self.body
        )
        self.html.text='\n\t'
        self.html[0].text='\n\t\t'
        self.html[0].tail='\n'
        for child in self.html[0]:
            child.tail = '\n\t\t'
        self.html[0][-1].tail = '\n\t'
        self.html[1].text = self.html[1].tail = '\n'
        lang = html_lang(self.mi.language)
        if lang:
            self.html.set('lang', lang)
            self.doc_lang = lang
        else:
            self.doc_lang = None

    def __call__(self):
        doc = self.docx.document
        relationships_by_id, relationships_by_type = self.docx.document_relationships
        self.fields(doc, self.log)
        self.read_styles(relationships_by_type)
        self.images(relationships_by_id)
        self.layers = OrderedDict()
        self.framed = [[]]
        self.frame_map = {}
        self.framed_map = {}
        self.anchor_map = {}
        self.link_map = defaultdict(list)
        self.link_source_map = {}
        self.toc_anchor = None
        self.block_runs = []
        paras = []

        self.log.debug('Converting Word markup to HTML')

        self.read_page_properties(doc)
        self.resolve_alternate_content(doc)
        self.current_rels = relationships_by_id
        for wp, page_properties in iteritems(self.page_map):
            self.current_page = page_properties
            if wp.tag.endswith('}p'):
                p = self.convert_p(wp)
                self.body.append(p)
                paras.append(wp)

        self.read_block_anchors(doc)
        self.styles.apply_contextual_spacing(paras)
        self.mark_block_runs(paras)
        # Apply page breaks at the start of every section, except the first
        # section (since that will be the start of the file)
        self.styles.apply_section_page_breaks(self.section_starts[1:])

        notes_header = None
        orig_rid_map = self.images.rid_map
        if self.footnotes.has_notes:
            self.body.append(H1(self.notes_text))
            notes_header = self.body[-1]
            notes_header.set('class', 'notes-header')
            for anchor, text, note in self.footnotes:
                dl = DL(id=anchor)
                dl.set('class', 'footnote')
                self.body.append(dl)
                dl.append(DT('[', A('←' + text, href='#back_%s' % anchor, title=text)))
                dl[-1][0].tail = ']'
                dl.append(DD())
                paras = []
                self.images.rid_map = self.current_rels = note.rels[0]
                for wp in note:
                    if wp.tag.endswith('}tbl'):
                        self.tables.register(wp, self.styles)
                        self.page_map[wp] = self.current_page
                    else:
                        p = self.convert_p(wp)
                        dl[-1].append(p)
                        paras.append(wp)
                self.styles.apply_contextual_spacing(paras)
                self.mark_block_runs(paras)

        for p, wp in iteritems(self.object_map):
            if len(p) > 0 and not p.text and len(p[0]) > 0 and not p[0].text and p[0][0].get('class', None) == 'tab':
                # Paragraph uses tabs for indentation, convert to text-indent
                parent = p[0]
                tabs = []
                for child in parent:
                    if child.get('class', None) == 'tab':
                        tabs.append(child)
                        if child.tail:
                            break
                    else:
                        break
                indent = len(tabs) * self.settings.default_tab_stop
                style = self.styles.resolve(wp)
                if style.text_indent is inherit or (hasattr(style.text_indent, 'endswith') and style.text_indent.endswith('pt')):
                    if style.text_indent is not inherit:
                        indent = float(style.text_indent[:-2]) + indent
                    style.text_indent = '%.3gpt' % indent
                    parent.text = tabs[-1].tail or ''
                    list(map(parent.remove, tabs))

        self.images.rid_map = orig_rid_map

        self.resolve_links()

        self.styles.cascade(self.layers)

        self.tables.apply_markup(self.object_map, self.page_map)

        numbered = []
        for html_obj, obj in iteritems(self.object_map):
            raw = obj.get('calibre_num_id', None)
            if raw is not None:
                lvl, num_id = raw.partition(':')[0::2]
                try:
                    lvl = int(lvl)
                except (TypeError, ValueError):
                    lvl = 0
                numbered.append((html_obj, num_id, lvl))
        self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map, self.images)
        self.apply_frames()

        if len(self.body) > 0:
            self.body.text = '\n\t'
            for child in self.body:
                child.tail = '\n\t'
            self.body[-1].tail = '\n'

        self.log.debug('Converting styles to CSS')
        self.styles.generate_classes()
        for html_obj, obj in iteritems(self.object_map):
            style = self.styles.resolve(obj)
            if style is not None:
                css = style.css
                if css:
                    cls = self.styles.class_name(css)
                    if cls:
                        html_obj.set('class', cls)
        for html_obj, css in iteritems(self.framed_map):
            cls = self.styles.class_name(css)
            if cls:
                html_obj.set('class', cls)

        if notes_header is not None:
            for h in self.namespace.children(self.body, 'h1', 'h2', 'h3'):
                notes_header.tag = h.tag
                cls = h.get('class', None)
                if cls and cls != 'notes-header':
                    notes_header.set('class', '%s notes-header' % cls)
                break

        self.fields.polish_markup(self.object_map)

        self.log.debug('Cleaning up redundant markup generated by Word')
        self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover, self.namespace.XPath)

        return self.write(doc)

    def read_page_properties(self, doc):
        current = []
        self.page_map = OrderedDict()
        self.section_starts = []

        for p in self.namespace.descendants(doc, 'w:p', 'w:tbl'):
            if p.tag.endswith('}tbl'):
                self.tables.register(p, self.styles)
                current.append(p)
                continue
            sect = tuple(self.namespace.descendants(p, 'w:sectPr'))
            if sect:
                pr = PageProperties(self.namespace, sect)
                paras = current + [p]
                for x in paras:
                    self.page_map[x] = pr
                self.section_starts.append(paras[0])
                current = []
            else:
                current.append(p)

        if current:
            self.section_starts.append(current[0])
            last = self.namespace.XPath('./w:body/w:sectPr')(doc)
            pr = PageProperties(self.namespace, last)
            for x in current:
                self.page_map[x] = pr

    def resolve_alternate_content(self, doc):
        # For proprietary extensions in Word documents use the fallback, spec
        # compliant form
        # See https://wiki.openoffice.org/wiki/OOXML/Markup_Compatibility_and_Extensibility
        for ac in self.namespace.descendants(doc, 'mc:AlternateContent'):
            choices = self.namespace.XPath('./mc:Choice')(ac)
            fallbacks = self.namespace.XPath('./mc:Fallback')(ac)
            if fallbacks:
                for choice in choices:
                    ac.remove(choice)

    def read_styles(self, relationships_by_type):

        def get_name(rtype, defname):
            name = relationships_by_type.get(rtype, None)
            if name is None:
                cname = self.docx.document_name.split('/')
                cname[-1] = defname
                if self.docx.exists('/'.join(cname)):
                    name = name
            if name and name.startswith('word/word') and not self.docx.exists(name):
                name = name.partition('/')[2]
            return name

        nname = get_name(self.namespace.names['NUMBERING'], 'numbering.xml')
        sname = get_name(self.namespace.names['STYLES'], 'styles.xml')
        sename = get_name(self.namespace.names['SETTINGS'], 'settings.xml')
        fname = get_name(self.namespace.names['FONTS'], 'fontTable.xml')
        tname = get_name(self.namespace.names['THEMES'], 'theme1.xml')
        foname = get_name(self.namespace.names['FOOTNOTES'], 'footnotes.xml')
        enname = get_name(self.namespace.names['ENDNOTES'], 'endnotes.xml')
        numbering = self.numbering = Numbering(self.namespace)
        footnotes = self.footnotes = Footnotes(self.namespace)
        fonts = self.fonts = Fonts(self.namespace)

        foraw = enraw = None
        forel, enrel = ({}, {}), ({}, {})
        if sename is not None:
            try:
                seraw = self.docx.read(sename)
            except KeyError:
                self.log.warn('Settings %s do not exist' % sename)
            except EnvironmentError as e:
                if e.errno != errno.ENOENT:
                    raise
                self.log.warn('Settings %s file missing' % sename)
            else:
                self.settings(fromstring(seraw))

        if foname is not None:
            try:
                foraw = self.docx.read(foname)
            except KeyError:
                self.log.warn('Footnotes %s do not exist' % foname)
            else:
                forel = self.docx.get_relationships(foname)
        if enname is not None:
            try:
                enraw = self.docx.read(enname)
            except KeyError:
                self.log.warn('Endnotes %s do not exist' % enname)
            else:
                enrel = self.docx.get_relationships(enname)
        footnotes(fromstring(foraw) if foraw else None, forel, fromstring(enraw) if enraw else None, enrel)

        if fname is not None:
            embed_relationships = self.docx.get_relationships(fname)[0]
            try:
                raw = self.docx.read(fname)
            except KeyError:
                self.log.warn('Fonts table %s does not exist' % fname)
            else:
                fonts(fromstring(raw), embed_relationships, self.docx, self.dest_dir)

        if tname is not None:
            try:
                raw = self.docx.read(tname)
            except KeyError:
                self.log.warn('Styles %s do not exist' % sname)
            else:
                self.theme(fromstring(raw))

        styles_loaded = False
        if sname is not None:
            try:
                raw = self.docx.read(sname)
            except KeyError:
                self.log.warn('Styles %s do not exist' % sname)
            else:
                self.styles(fromstring(raw), fonts, self.theme)
                styles_loaded = True
        if not styles_loaded:
            self.styles(None, fonts, self.theme)

        if nname is not None:
            try:
                raw = self.docx.read(nname)
            except KeyError:
                self.log.warn('Numbering styles %s do not exist' % nname)
            else:
                numbering(fromstring(raw), self.styles, self.docx.get_relationships(nname)[0])

        self.styles.resolve_numbering(numbering)

    def write(self, doc):
        toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map, self.log, self.namespace)
        raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
        with lopen(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
            f.write(raw)
        css = self.styles.generate_css(self.dest_dir, self.docx, self.notes_nopb, self.nosupsub)
        if css:
            with lopen(os.path.join(self.dest_dir, 'docx.css'), 'wb') as f:
                f.write(css.encode('utf-8'))

        opf = OPFCreator(self.dest_dir, self.mi)
        opf.toc = toc
        opf.create_manifest_from_files_in([self.dest_dir])
        for item in opf.manifest:
            if item.media_type == 'text/html':
                item.media_type = guess_type('a.xhtml')[0]
        opf.create_spine(['index.html'])
        if self.cover_image is not None:
            opf.guide.set_cover(self.cover_image)

        def process_guide(E, guide):
            if self.toc_anchor is not None:
                guide.append(E.reference(
                    href='index.html#' + self.toc_anchor, title=_('Table of Contents'), type='toc'))
        toc_file = os.path.join(self.dest_dir, 'toc.ncx')
        with lopen(os.path.join(self.dest_dir, 'metadata.opf'), 'wb') as of, open(toc_file, 'wb') as ncx:
            opf.render(of, ncx, 'toc.ncx', process_guide=process_guide)
        if os.path.getsize(toc_file) == 0:
            os.remove(toc_file)
        return os.path.join(self.dest_dir, 'metadata.opf')

    def read_block_anchors(self, doc):
        doc_anchors = frozenset(self.namespace.XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
        if doc_anchors:
            current_bm = set()
            rmap = {v:k for k, v in iteritems(self.object_map)}
            for p in self.namespace.descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
                if p.tag.endswith('}p'):
                    if current_bm and p in rmap:
                        para = rmap[p]
                        if 'id' not in para.attrib:
                            para.set('id', generate_anchor(next(iter(current_bm)), frozenset(itervalues(self.anchor_map))))
                        for name in current_bm:
                            self.anchor_map[name] = para.get('id')
                        current_bm = set()
                elif p in doc_anchors:
                    anchor = self.namespace.get(p, 'w:name')
                    if anchor:
                        current_bm.add(anchor)

    def convert_p(self, p):
        dest = P()
        self.object_map[dest] = p
        style = self.styles.resolve_paragraph(p)
        self.layers[p] = []
        self.frame_map[p] = style.frame
        self.add_frame(dest, style.frame)

        current_anchor = None
        current_hyperlink = None
        hl_xpath = self.namespace.XPath('ancestor::w:hyperlink[1]')

        def p_parent(x):
            # Ensure that nested <w:p> tags are handled. These can occur if a
            # textbox is present inside a paragraph.
            while True:
                x = x.getparent()
                try:
                    if x.tag.endswith('}p'):
                        return x
                except AttributeError:
                    break

        for x in self.namespace.descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink', 'w:instrText'):
            if p_parent(x) is not p:
                continue
            if x.tag.endswith('}r'):
                span = self.convert_run(x)
                if current_anchor is not None:
                    (dest if len(dest) == 0 else span).set('id', current_anchor)
                    current_anchor = None
                if current_hyperlink is not None:
                    try:
                        hl = hl_xpath(x)[0]
                        self.link_map[hl].append(span)
                        self.link_source_map[hl] = self.current_rels
                        x.set('is-link', '1')
                    except IndexError:
                        current_hyperlink = None
                dest.append(span)
                self.layers[p].append(x)
            elif x.tag.endswith('}bookmarkStart'):
                anchor = self.namespace.get(x, 'w:name')
                if anchor and anchor not in self.anchor_map and anchor != '_GoBack':
                    # _GoBack is a special bookmark inserted by Word 2010 for
                    # the return to previous edit feature, we ignore it
                    old_anchor = current_anchor
                    self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(itervalues(self.anchor_map)))
                    if old_anchor is not None:
                        # The previous anchor was not applied to any element
                        for a, t in tuple(iteritems(self.anchor_map)):
                            if t == old_anchor:
                                self.anchor_map[a] = current_anchor
            elif x.tag.endswith('}hyperlink'):
                current_hyperlink = x
            elif x.tag.endswith('}instrText') and x.text and x.text.strip().startswith('TOC '):
                old_anchor = current_anchor
                anchor = unicode_type(uuid.uuid4())
                self.anchor_map[anchor] = current_anchor = generate_anchor('toc', frozenset(itervalues(self.anchor_map)))
                self.toc_anchor = current_anchor
                if old_anchor is not None:
                    # The previous anchor was not applied to any element
                    for a, t in tuple(iteritems(self.anchor_map)):
                        if t == old_anchor:
                            self.anchor_map[a] = current_anchor
        if current_anchor is not None:
            # This paragraph had no <w:r> descendants
            dest.set('id', current_anchor)
            current_anchor = None

        m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
        if m is not None:
            n = min(6, max(1, int(m.group(1))))
            dest.tag = 'h%d' % n

        if style.bidi is True:
            dest.set('dir', 'rtl')

        border_runs = []
        common_borders = []
        for span in dest:
            run = self.object_map[span]
            style = self.styles.resolve_run(run)
            if not border_runs or border_runs[-1][1].same_border(style):
                border_runs.append((span, style))
            elif border_runs:
                if len(border_runs) > 1:
                    common_borders.append(border_runs)
                border_runs = []

        for border_run in common_borders:
            spans = []
            bs = {}
            for span, style in border_run:
                style.get_border_css(bs)
                style.clear_border_css()
                spans.append(span)
            if bs:
                cls = self.styles.register(bs, 'text_border')
                wrapper = self.wrap_elems(spans, SPAN())
                wrapper.set('class', cls)

        if not dest.text and len(dest) == 0 and not style.has_visible_border():
            # Empty paragraph add a non-breaking space so that it is rendered
            # by WebKit
            dest.text = NBSP

        # If the last element in a block is a <br> the <br> is not rendered in
        # HTML, unless it is followed by a trailing space. Word, on the other
        # hand inserts a blank line for trailing <br>s.
        if len(dest) > 0 and not dest[-1].tail:
            if dest[-1].tag == 'br':
                dest[-1].tail = NBSP
            elif len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and not dest[-1][-1].tail:
                dest[-1][-1].tail = NBSP

        return dest

    def wrap_elems(self, elems, wrapper):
        p = elems[0].getparent()
        idx = p.index(elems[0])
        p.insert(idx, wrapper)
        wrapper.tail = elems[-1].tail
        elems[-1].tail = None
        for elem in elems:
            try:
                p.remove(elem)
            except ValueError:
                # Probably a hyperlink that spans multiple
                # paragraphs,theoretically we should break this up into
                # multiple hyperlinks, but I can't be bothered.
                elem.getparent().remove(elem)
            wrapper.append(elem)
        return wrapper

    def resolve_links(self):
        self.resolved_link_map = {}
        for hyperlink, spans in iteritems(self.link_map):
            relationships_by_id = self.link_source_map[hyperlink]
            span = spans[0]
            if len(spans) > 1:
                span = self.wrap_elems(spans, SPAN())
            span.tag = 'a'
            self.resolved_link_map[hyperlink] = span
            tgt = self.namespace.get(hyperlink, 'w:tgtFrame')
            if tgt:
                span.set('target', tgt)
            tt = self.namespace.get(hyperlink, 'w:tooltip')
            if tt:
                span.set('title', tt)
            rid = self.namespace.get(hyperlink, 'r:id')
            if rid and rid in relationships_by_id:
                span.set('href', relationships_by_id[rid])
                continue
            anchor = self.namespace.get(hyperlink, 'w:anchor')
            if anchor and anchor in self.anchor_map:
                span.set('href', '#' + self.anchor_map[anchor])
                continue
            self.log.warn('Hyperlink with unknown target (rid=%s, anchor=%s), ignoring' %
                          (rid, anchor))
            # hrefs that point nowhere give epubcheck a hernia. The element
            # should be styled explicitly by Word anyway.
            # span.set('href', '#')
        rmap = {v:k for k, v in iteritems(self.object_map)}
        for hyperlink, runs in self.fields.hyperlink_fields:
            spans = [rmap[r] for r in runs if r in rmap]
            if not spans:
                continue
            span = spans[0]
            if len(spans) > 1:
                span = self.wrap_elems(spans, SPAN())
            span.tag = 'a'
            tgt = hyperlink.get('target', None)
            if tgt:
                span.set('target', tgt)
            tt = hyperlink.get('title', None)
            if tt:
                span.set('title', tt)
            url = hyperlink.get('url', None)
            if url is None:
                anchor = hyperlink.get('anchor', None)
                if anchor in self.anchor_map:
                    span.set('href', '#' + self.anchor_map[anchor])
                    continue
                self.log.warn('Hyperlink field with unknown anchor: %s' % anchor)
            else:
                if url in self.anchor_map:
                    span.set('href', '#' + self.anchor_map[url])
                    continue
                span.set('href', url)

        for img, link, relationships_by_id in self.images.links:
            parent = img.getparent()
            idx = parent.index(img)
            a = A(img)
            a.tail, img.tail = img.tail, None
            parent.insert(idx, a)
            tgt = link.get('target', None)
            if tgt:
                a.set('target', tgt)
            tt = link.get('title', None)
            if tt:
                a.set('title', tt)
            rid = link['id']
            if rid in relationships_by_id:
                dest = relationships_by_id[rid]
                if dest.startswith('#'):
                    if dest[1:] in self.anchor_map:
                        a.set('href', '#' + self.anchor_map[dest[1:]])
                else:
                    a.set('href', dest)

    def convert_run(self, run):
        ans = SPAN()
        self.object_map[ans] = run
        text = Text(ans, 'text', [])

        for child in run:
            if self.namespace.is_tag(child, 'w:t'):
                if not child.text:
                    continue
                space = child.get(XML('space'), None)
                preserve = False
                ctext = child.text
                if space != 'preserve':
                    # Remove leading and trailing whitespace. Word ignores
                    # leading and trailing whitespace without preserve
                    ctext = ctext.strip(' \n\r\t')
                # Only use a <span> with white-space:pre-wrap if this element
                # actually needs it, i.e. if it has more than one
                # consecutive space or it has newlines or tabs.
                multi_spaces = self.ms_pat.search(ctext) is not None
                preserve = multi_spaces or self.ws_pat.search(ctext) is not None
                if preserve:
                    text.add_elem(SPAN(ctext, style="white-space:pre-wrap"))
                    ans.append(text.elem)
                else:
                    text.buf.append(ctext)
            elif self.namespace.is_tag(child, 'w:cr'):
                text.add_elem(BR())
                ans.append(text.elem)
            elif self.namespace.is_tag(child, 'w:br'):
                typ = self.namespace.get(child, 'w:type')
                if typ in {'column', 'page'}:
                    br = BR(style='page-break-after:always')
                else:
                    clear = child.get('clear', None)
                    if clear in {'all', 'left', 'right'}:
                        br = BR(style='clear:%s'%('both' if clear == 'all' else clear))
                    else:
                        br = BR()
                text.add_elem(br)
                ans.append(text.elem)
            elif self.namespace.is_tag(child, 'w:drawing') or self.namespace.is_tag(child, 'w:pict'):
                for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
                    text.add_elem(img)
                    ans.append(text.elem)
            elif self.namespace.is_tag(child, 'w:footnoteReference') or self.namespace.is_tag(child, 'w:endnoteReference'):
                anchor, name = self.footnotes.get_ref(child)
                if anchor and name:
                    l = A(SUP(name, id='back_%s' % anchor), href='#' + anchor, title=name)
                    l.set('class', 'noteref')
                    text.add_elem(l)
                    ans.append(text.elem)
            elif self.namespace.is_tag(child, 'w:tab'):
                spaces = int(math.ceil((self.settings.default_tab_stop / 36) * 6))
                text.add_elem(SPAN(NBSP * spaces))
                ans.append(text.elem)
                ans[-1].set('class', 'tab')
            elif self.namespace.is_tag(child, 'w:noBreakHyphen'):
                text.buf.append('\u2011')
            elif self.namespace.is_tag(child, 'w:softHyphen'):
                text.buf.append('\u00ad')
        if text.buf:
            setattr(text.elem, text.attr, ''.join(text.buf))

        style = self.styles.resolve_run(run)
        if style.vert_align in {'superscript', 'subscript'}:
            ans.tag = 'sub' if style.vert_align == 'subscript' else 'sup'
        if style.lang is not inherit:
            lang = html_lang(style.lang)
            if lang is not None and lang != self.doc_lang:
                ans.set('lang', lang)
        if style.rtl is True:
            ans.set('dir', 'rtl')
        if is_symbol_font(style.font_family):
            for elem in text:
                if elem.text:
                    elem.text = map_symbol_text(elem.text, style.font_family)
                if elem.tail:
                    elem.tail = map_symbol_text(elem.tail, style.font_family)
            style.font_family = 'sans-serif'
        return ans

    def add_frame(self, html_obj, style):
        last_run = self.framed[-1]
        if style is inherit:
            if last_run:
                self.framed.append([])
            return

        if last_run:
            if last_run[-1][1] == style:
                last_run.append((html_obj, style))
            else:
                self.framed[-1].append((html_obj, style))
        else:
            last_run.append((html_obj, style))

    def apply_frames(self):
        for run in filter(None, self.framed):
            style = run[0][1]
            paras = tuple(x[0] for x in run)
            parent = paras[0].getparent()
            idx = parent.index(paras[0])
            frame = DIV(*paras)
            parent.insert(idx, frame)
            self.framed_map[frame] = css = style.css(self.page_map[self.object_map[paras[0]]])
            self.styles.register(css, 'frame')

        if not self.block_runs:
            return
        rmap = {v:k for k, v in iteritems(self.object_map)}
        for border_style, blocks in self.block_runs:
            paras = tuple(rmap[p] for p in blocks)
            for p in paras:
                if p.tag == 'li':
                    has_li = True
                    break
            else:
                has_li = False
            parent = paras[0].getparent()
            if parent.tag in ('ul', 'ol'):
                ul = parent
                parent = ul.getparent()
                idx = parent.index(ul)
                frame = DIV(ul)
            elif has_li:
                def top_level_tag(x):
                    while True:
                        q = x.getparent()
                        if q is parent or q is None:
                            break
                        x = q
                    return x
                paras = tuple(map(top_level_tag, paras))
                idx = parent.index(paras[0])
                frame = DIV(*paras)
            else:
                idx = parent.index(paras[0])
                frame = DIV(*paras)
            parent.insert(idx, frame)
            self.framed_map[frame] = css = border_style.css
            self.styles.register(css, 'frame')

    def mark_block_runs(self, paras):

        def process_run(run):
            max_left = max_right = 0
            has_visible_border = None
            for p in run:
                style = self.styles.resolve_paragraph(p)
                if has_visible_border is None:
                    has_visible_border = style.has_visible_border()
                if isinstance(style.margin_left, numbers.Number):
                    max_left = max(style.margin_left, max_left)
                if isinstance(style.margin_right, numbers.Number):
                    max_right = max(style.margin_right, max_right)
                if has_visible_border:
                    style.margin_left = style.margin_right = inherit
                if p is not run[0]:
                    style.padding_top = 0
                else:
                    border_style = style.clone_border_styles()
                    if has_visible_border:
                        border_style.margin_top, style.margin_top = style.margin_top, inherit
                if p is not run[-1]:
                    style.padding_bottom = 0
                else:
                    if has_visible_border:
                        border_style.margin_bottom, style.margin_bottom = style.margin_bottom, inherit
                style.clear_borders()
                if p is not run[-1]:
                    style.apply_between_border()
            if has_visible_border:
                border_style.margin_left, border_style.margin_right = max_left,max_right
                self.block_runs.append((border_style, run))

        run = []
        for p in paras:
            if run and self.frame_map.get(p) == self.frame_map.get(run[-1]):
                style = self.styles.resolve_paragraph(p)
                last_style = self.styles.resolve_paragraph(run[-1])
                if style.has_identical_borders(last_style):
                    run.append(p)
                    continue
            if len(run) > 1:
                process_run(run)
            run = [p]
        if len(run) > 1:
            process_run(run)
Пример #7
0
class Convert(object):
    def __init__(self, path_or_stream, dest_dir=None, log=None, notes_text=None):
        self.docx = DOCX(path_or_stream, log=log)
        self.log = self.docx.log
        self.notes_text = notes_text or _("Notes")
        self.dest_dir = dest_dir or os.getcwdu()
        self.mi = self.docx.metadata
        self.body = BODY()
        self.styles = Styles()
        self.images = Images()
        self.tables = Tables()
        self.object_map = OrderedDict()
        self.html = HTML(
            HEAD(
                META(charset="utf-8"),
                TITLE(self.mi.title or _("Unknown")),
                LINK(rel="stylesheet", type="text/css", href="docx.css"),
            ),
            self.body,
        )
        self.html.text = "\n\t"
        self.html[0].text = "\n\t\t"
        self.html[0].tail = "\n"
        for child in self.html[0]:
            child.tail = "\n\t\t"
        self.html[0][-1].tail = "\n\t"
        self.html[1].text = self.html[1].tail = "\n"
        lang = canonicalize_lang(self.mi.language)
        if lang and lang != "und":
            lang = lang_as_iso639_1(lang)
            if lang:
                self.html.set("lang", lang)

    def __call__(self):
        doc = self.docx.document
        relationships_by_id, relationships_by_type = self.docx.document_relationships
        self.read_styles(relationships_by_type)
        self.images(relationships_by_id)
        self.layers = OrderedDict()
        self.framed = [[]]
        self.framed_map = {}
        self.anchor_map = {}
        self.link_map = defaultdict(list)

        self.read_page_properties(doc)
        for wp, page_properties in self.page_map.iteritems():
            self.current_page = page_properties
            p = self.convert_p(wp)
            self.body.append(p)

        notes_header = None
        if self.footnotes.has_notes:
            dl = DL()
            dl.set("class", "notes")
            self.body.append(H1(self.notes_text))
            notes_header = self.body[-1]
            notes_header.set("class", "notes-header")
            self.body.append(dl)
            for anchor, text, note in self.footnotes:
                dl.append(DT("[", A("←" + text, href="#back_%s" % anchor, title=text), id=anchor))
                dl[-1][0].tail = "]"
                dl.append(DD())
                in_table = False
                for wp in note:
                    if wp.tag.endswith("}tbl"):
                        self.tables.register(wp)
                        in_table = True
                        continue
                    if in_table:
                        if ancestor(wp, "w:tbl") is not None:
                            self.tables.add(wp)
                        else:
                            in_table = False
                    p = self.convert_p(wp)
                    dl[-1].append(p)

        self.resolve_links(relationships_by_id)

        self.styles.cascade(self.layers)

        self.tables.apply_markup(self.object_map)

        numbered = []
        for html_obj, obj in self.object_map.iteritems():
            raw = obj.get("calibre_num_id", None)
            if raw is not None:
                lvl, num_id = raw.partition(":")[0::2]
                try:
                    lvl = int(lvl)
                except (TypeError, ValueError):
                    lvl = 0
                numbered.append((html_obj, num_id, lvl))
        self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map)
        self.apply_frames()

        if len(self.body) > 0:
            self.body.text = "\n\t"
            for child in self.body:
                child.tail = "\n\t"
            self.body[-1].tail = "\n"

        self.styles.generate_classes()
        for html_obj, obj in self.object_map.iteritems():
            style = self.styles.resolve(obj)
            if style is not None:
                css = style.css
                if css:
                    cls = self.styles.class_name(css)
                    if cls:
                        html_obj.set("class", cls)
        for html_obj, css in self.framed_map.iteritems():
            cls = self.styles.class_name(css)
            if cls:
                html_obj.set("class", cls)

        if notes_header is not None:
            for h in self.body.iterchildren("h1", "h2", "h3"):
                notes_header.tag = h.tag
                cls = h.get("class", None)
                if cls and cls != "notes-header":
                    notes_header.set("class", "%s notes-header" % cls)
                break

        return self.write()

    def read_page_properties(self, doc):
        current = []
        self.page_map = OrderedDict()

        in_table = False

        for p in descendants(doc, "w:p", "w:tbl"):
            if p.tag.endswith("}tbl"):
                in_table = True
                self.tables.register(p)
                continue
            sect = tuple(descendants(p, "w:sectPr"))
            if sect:
                pr = PageProperties(sect)
                for x in current + [p]:
                    self.page_map[x] = pr
                current = []
            else:
                current.append(p)
            if in_table:
                if ancestor(p, "w:tbl") is not None:
                    self.tables.add(p)
                else:
                    in_table = False
        if current:
            last = XPath("./w:body/w:sectPr")(doc)
            pr = PageProperties(last)
            for x in current:
                self.page_map[x] = pr

    def read_styles(self, relationships_by_type):
        def get_name(rtype, defname):
            name = relationships_by_type.get(rtype, None)
            if name is None:
                cname = self.docx.document_name.split("/")
                cname[-1] = defname
                if self.docx.exists("/".join(cname)):
                    name = name
            return name

        nname = get_name(NUMBERING, "numbering.xml")
        sname = get_name(STYLES, "styles.xml")
        fname = get_name(FONTS, "fontTable.xml")
        foname = get_name(FOOTNOTES, "footnotes.xml")
        enname = get_name(ENDNOTES, "endnotes.xml")
        numbering = self.numbering = Numbering()
        footnotes = self.footnotes = Footnotes()
        fonts = self.fonts = Fonts()

        foraw = enraw = None
        if foname is not None:
            try:
                foraw = self.docx.read(foname)
            except KeyError:
                self.log.warn("Footnotes %s do not exist" % foname)
        if enname is not None:
            try:
                enraw = self.docx.read(enname)
            except KeyError:
                self.log.warn("Endnotes %s do not exist" % enname)
        footnotes(fromstring(foraw) if foraw else None, fromstring(enraw) if enraw else None)

        if fname is not None:
            embed_relationships = self.docx.get_relationships(fname)[0]
            try:
                raw = self.docx.read(fname)
            except KeyError:
                self.log.warn("Fonts table %s does not exist" % fname)
            else:
                fonts(fromstring(raw), embed_relationships, self.docx, self.dest_dir)

        if sname is not None:
            try:
                raw = self.docx.read(sname)
            except KeyError:
                self.log.warn("Styles %s do not exist" % sname)
            else:
                self.styles(fromstring(raw), fonts)

        if nname is not None:
            try:
                raw = self.docx.read(nname)
            except KeyError:
                self.log.warn("Numbering styles %s do not exist" % nname)
            else:
                numbering(fromstring(raw), self.styles)

        self.styles.resolve_numbering(numbering)

    def create_toc(self):
        " Create a TOC from headings in the document "
        root = self.body
        headings = ("h1", "h2", "h3")
        tocroot = TOC()
        xpaths = [XPath("//%s" % x) for x in headings]
        level_prev = {i + 1: None for i in xrange(len(xpaths))}
        level_prev[0] = tocroot
        level_item_map = {i + 1: frozenset(xp(root)) for i, xp in enumerate(xpaths)}
        item_level_map = {e: i for i, elems in level_item_map.iteritems() for e in elems}

        self.idcount = 0

        def ensure_id(elem):
            ans = elem.get("id", None)
            if not ans:
                self.idcount += 1
                ans = "toc_id_%d" % self.idcount
                elem.set("id", ans)
            return ans

        for item in root.iterdescendants(*headings):
            lvl = plvl = item_level_map.get(item, None)
            if lvl is None:
                continue
            parent = None
            while parent is None:
                plvl -= 1
                parent = level_prev[plvl]
            lvl = plvl + 1
            elem_id = ensure_id(item)
            text = elem_to_toc_text(item)
            toc = parent.add_item("index.html", elem_id, text)
            level_prev[lvl] = toc
            for i in xrange(lvl + 1, len(xpaths) + 1):
                level_prev[i] = None

        if len(tuple(tocroot.flat())) > 1:
            return tocroot

    def write(self):
        toc = self.create_toc()
        raw = html.tostring(self.html, encoding="utf-8", doctype="<!DOCTYPE html>")
        with open(os.path.join(self.dest_dir, "index.html"), "wb") as f:
            f.write(raw)
        css = self.styles.generate_css(self.dest_dir, self.docx)
        if css:
            with open(os.path.join(self.dest_dir, "docx.css"), "wb") as f:
                f.write(css.encode("utf-8"))

        opf = OPFCreator(self.dest_dir, self.mi)
        opf.toc = toc
        opf.create_manifest_from_files_in([self.dest_dir])
        opf.create_spine(["index.html"])
        with open(os.path.join(self.dest_dir, "metadata.opf"), "wb") as of, open(
            os.path.join(self.dest_dir, "toc.ncx"), "wb"
        ) as ncx:
            opf.render(of, ncx, "toc.ncx")
        return os.path.join(self.dest_dir, "metadata.opf")

    def convert_p(self, p):
        dest = P()
        self.object_map[dest] = p
        style = self.styles.resolve_paragraph(p)
        self.layers[p] = []
        self.add_frame(dest, style.frame)

        current_anchor = None
        current_hyperlink = None

        for x in descendants(p, "w:r", "w:bookmarkStart", "w:hyperlink"):
            if x.tag.endswith("}r"):
                span = self.convert_run(x)
                if current_anchor is not None:
                    (dest if len(dest) == 0 else span).set("id", current_anchor)
                    current_anchor = None
                if current_hyperlink is not None:
                    hl = ancestor(x, "w:hyperlink")
                    if hl is not None:
                        self.link_map[hl].append(span)
                    else:
                        current_hyperlink = None
                dest.append(span)
                self.layers[p].append(x)
            elif x.tag.endswith("}bookmarkStart"):
                anchor = get(x, "w:name")
                if anchor and anchor not in self.anchor_map:
                    self.anchor_map[anchor] = current_anchor = generate_anchor(
                        anchor, frozenset(self.anchor_map.itervalues())
                    )
            elif x.tag.endswith("}hyperlink"):
                current_hyperlink = x

        m = re.match(r"heading\s+(\d+)$", style.style_name or "", re.IGNORECASE)
        if m is not None:
            n = min(6, max(1, int(m.group(1))))
            dest.tag = "h%d" % n

        if style.direction == "rtl":
            dest.set("dir", "rtl")

        border_runs = []
        common_borders = []
        for span in dest:
            run = self.object_map[span]
            style = self.styles.resolve_run(run)
            if not border_runs or border_runs[-1][1].same_border(style):
                border_runs.append((span, style))
            elif border_runs:
                if len(border_runs) > 1:
                    common_borders.append(border_runs)
                border_runs = []

        for border_run in common_borders:
            spans = []
            bs = {}
            for span, style in border_run:
                style.get_border_css(bs)
                style.clear_border_css()
                spans.append(span)
            if bs:
                cls = self.styles.register(bs, "text_border")
                wrapper = self.wrap_elems(spans, SPAN())
                wrapper.set("class", cls)

        return dest

    def wrap_elems(self, elems, wrapper):
        p = elems[0].getparent()
        idx = p.index(elems[0])
        p.insert(idx, wrapper)
        wrapper.tail = elems[-1].tail
        elems[-1].tail = None
        for elem in elems:
            p.remove(elem)
            wrapper.append(elem)
        return wrapper

    def resolve_links(self, relationships_by_id):
        for hyperlink, spans in self.link_map.iteritems():
            span = spans[0]
            if len(spans) > 1:
                span = self.wrap_elems(spans, SPAN())
            span.tag = "a"
            tgt = get(hyperlink, "w:tgtFrame")
            if tgt:
                span.set("target", tgt)
            tt = get(hyperlink, "w:tooltip")
            if tt:
                span.set("title", tt)
            rid = get(hyperlink, "r:id")
            if rid and rid in relationships_by_id:
                span.set("href", relationships_by_id[rid])
                continue
            anchor = get(hyperlink, "w:anchor")
            if anchor and anchor in self.anchor_map:
                span.set("href", "#" + self.anchor_map[anchor])
                continue
            self.log.warn("Hyperlink with unknown target (%s, %s), ignoring" % (rid, anchor))
            span.set("href", "#")

    def convert_run(self, run):
        ans = SPAN()
        self.object_map[ans] = run
        text = Text(ans, "text", [])

        for child in run:
            if is_tag(child, "w:t"):
                if not child.text:
                    continue
                space = child.get(XML("space"), None)
                if space == "preserve":
                    text.add_elem(SPAN(child.text, style="whitespace:pre-wrap"))
                    ans.append(text.elem)
                else:
                    text.buf.append(child.text)
            elif is_tag(child, "w:cr"):
                text.add_elem(BR())
                ans.append(text.elem)
            elif is_tag(child, "w:br"):
                typ = child.get("type", None)
                if typ in {"column", "page"}:
                    br = BR(style="page-break-after:always")
                else:
                    clear = child.get("clear", None)
                    if clear in {"all", "left", "right"}:
                        br = BR(style="clear:%s" % ("both" if clear == "all" else clear))
                    else:
                        br = BR()
                text.add_elem(br)
                ans.append(text.elem)
            elif is_tag(child, "w:drawing") or is_tag(child, "w:pict"):
                for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
                    text.add_elem(img)
                    ans.append(text.elem)
            elif is_tag(child, "w:footnoteReference") or is_tag(child, "w:endnoteReference"):
                anchor, name = self.footnotes.get_ref(child)
                if anchor and name:
                    l = SUP(A(name, href="#" + anchor, title=name), id="back_%s" % anchor)
                    l.set("class", "noteref")
                    text.add_elem(l)
                    ans.append(text.elem)
        if text.buf:
            setattr(text.elem, text.attr, "".join(text.buf))

        style = self.styles.resolve_run(run)
        if style.vert_align in {"superscript", "subscript"}:
            ans.tag = "sub" if style.vert_align == "subscript" else "sup"
        if style.lang is not inherit:
            ans.lang = style.lang
        return ans

    def add_frame(self, html_obj, style):
        last_run = self.framed[-1]
        if style is inherit:
            if last_run:
                self.framed.append([])
            return

        if last_run:
            if last_run[-1][1] == style:
                last_run.append((html_obj, style))
            else:
                self.framed.append((html_obj, style))
        else:
            last_run.append((html_obj, style))

    def apply_frames(self):
        for run in filter(None, self.framed):
            style = run[0][1]
            paras = tuple(x[0] for x in run)
            parent = paras[0].getparent()
            idx = parent.index(paras[0])
            frame = DIV(*paras)
            parent.insert(idx, frame)
            self.framed_map[frame] = css = style.css(self.page_map[self.object_map[paras[0]]])
            self.styles.register(css, "frame")
Пример #8
0
class Convert(object):

    def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None):
        self.docx = DOCX(path_or_stream, log=log)
        self.ms_pat = re.compile(r'\s{2,}')
        self.ws_pat = re.compile(r'[\n\r\t]')
        self.log = self.docx.log
        self.detect_cover = detect_cover
        self.notes_text = notes_text or _('Notes')
        self.dest_dir = dest_dir or os.getcwdu()
        self.mi = self.docx.metadata
        self.body = BODY()
        self.theme = Theme()
        self.settings = Settings()
        self.tables = Tables()
        self.fields = Fields()
        self.styles = Styles(self.tables)
        self.images = Images(self.log)
        self.object_map = OrderedDict()
        self.html = HTML(
            HEAD(
                META(charset='utf-8'),
                TITLE(self.mi.title or _('Unknown')),
                LINK(rel='stylesheet', type='text/css', href='docx.css'),
            ),
            self.body
        )
        self.html.text='\n\t'
        self.html[0].text='\n\t\t'
        self.html[0].tail='\n'
        for child in self.html[0]:
            child.tail = '\n\t\t'
        self.html[0][-1].tail = '\n\t'
        self.html[1].text = self.html[1].tail = '\n'
        lang = canonicalize_lang(self.mi.language)
        if lang and lang != 'und':
            lang = lang_as_iso639_1(lang)
            if lang:
                self.html.set('lang', lang)

    def __call__(self):
        doc = self.docx.document
        relationships_by_id, relationships_by_type = self.docx.document_relationships
        self.fields(doc, self.log)
        self.read_styles(relationships_by_type)
        self.images(relationships_by_id)
        self.layers = OrderedDict()
        self.framed = [[]]
        self.framed_map = {}
        self.anchor_map = {}
        self.link_map = defaultdict(list)
        self.link_source_map = {}
        paras = []

        self.log.debug('Converting Word markup to HTML')

        self.read_page_properties(doc)
        self.current_rels = relationships_by_id
        for wp, page_properties in self.page_map.iteritems():
            self.current_page = page_properties
            if wp.tag.endswith('}p'):
                p = self.convert_p(wp)
                self.body.append(p)
                paras.append(wp)

        self.read_block_anchors(doc)
        self.styles.apply_contextual_spacing(paras)
        # Apply page breaks at the start of every section, except the first
        # section (since that will be the start of the file)
        self.styles.apply_section_page_breaks(self.section_starts[1:])

        notes_header = None
        orig_rid_map = self.images.rid_map
        if self.footnotes.has_notes:
            dl = DL()
            dl.set('class', 'notes')
            self.body.append(H1(self.notes_text))
            notes_header = self.body[-1]
            notes_header.set('class', 'notes-header')
            self.body.append(dl)
            for anchor, text, note in self.footnotes:
                dl.append(DT('[', A('←' + text, href='#back_%s' % anchor, title=text), id=anchor))
                dl[-1][0].tail = ']'
                dl.append(DD())
                paras = []
                self.images.rid_map = self.current_rels = note.rels[0]
                for wp in note:
                    if wp.tag.endswith('}tbl'):
                        self.tables.register(wp, self.styles)
                        self.page_map[wp] = self.current_page
                    else:
                        p = self.convert_p(wp)
                        dl[-1].append(p)
                        paras.append(wp)
                self.styles.apply_contextual_spacing(paras)

        for p, wp in self.object_map.iteritems():
            if len(p) > 0 and not p.text and len(p[0]) > 0 and not p[0].text and p[0][0].get('class', None) == 'tab':
                # Paragraph uses tabs for indentation, convert to text-indent
                parent = p[0]
                tabs = []
                for child in parent:
                    if child.get('class', None) == 'tab':
                        tabs.append(child)
                        if child.tail:
                            break
                    else:
                        break
                indent = len(tabs) * self.settings.default_tab_stop
                style = self.styles.resolve(wp)
                if style.text_indent is inherit or (hasattr(style.text_indent, 'endswith') and style.text_indent.endswith('pt')):
                    if style.text_indent is not inherit:
                        indent = float(style.text_indent[:-2]) + indent
                    style.text_indent = '%.3gpt' % indent
                    parent.text = tabs[-1].tail or ''
                    map(parent.remove, tabs)

        self.images.rid_map = orig_rid_map

        self.resolve_links()

        self.styles.cascade(self.layers)

        self.tables.apply_markup(self.object_map, self.page_map)

        numbered = []
        for html_obj, obj in self.object_map.iteritems():
            raw = obj.get('calibre_num_id', None)
            if raw is not None:
                lvl, num_id = raw.partition(':')[0::2]
                try:
                    lvl = int(lvl)
                except (TypeError, ValueError):
                    lvl = 0
                numbered.append((html_obj, num_id, lvl))
        self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map, self.images)
        self.apply_frames()

        if len(self.body) > 0:
            self.body.text = '\n\t'
            for child in self.body:
                child.tail = '\n\t'
            self.body[-1].tail = '\n'

        self.log.debug('Converting styles to CSS')
        self.styles.generate_classes()
        for html_obj, obj in self.object_map.iteritems():
            style = self.styles.resolve(obj)
            if style is not None:
                css = style.css
                if css:
                    cls = self.styles.class_name(css)
                    if cls:
                        html_obj.set('class', cls)
        for html_obj, css in self.framed_map.iteritems():
            cls = self.styles.class_name(css)
            if cls:
                html_obj.set('class', cls)

        if notes_header is not None:
            for h in children(self.body, 'h1', 'h2', 'h3'):
                notes_header.tag = h.tag
                cls = h.get('class', None)
                if cls and cls != 'notes-header':
                    notes_header.set('class', '%s notes-header' % cls)
                break

        self.fields.polish_markup(self.object_map)

        self.log.debug('Cleaning up redundant markup generated by Word')
        self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover)

        return self.write(doc)

    def read_page_properties(self, doc):
        current = []
        self.page_map = OrderedDict()
        self.section_starts = []

        for p in descendants(doc, 'w:p', 'w:tbl'):
            if p.tag.endswith('}tbl'):
                self.tables.register(p, self.styles)
                current.append(p)
                continue
            sect = tuple(descendants(p, 'w:sectPr'))
            if sect:
                pr = PageProperties(sect)
                paras = current + [p]
                for x in paras:
                    self.page_map[x] = pr
                self.section_starts.append(paras[0])
                current = []
            else:
                current.append(p)

        if current:
            self.section_starts.append(current[0])
            last = XPath('./w:body/w:sectPr')(doc)
            pr = PageProperties(last)
            for x in current:
                self.page_map[x] = pr

    def read_styles(self, relationships_by_type):

        def get_name(rtype, defname):
            name = relationships_by_type.get(rtype, None)
            if name is None:
                cname = self.docx.document_name.split('/')
                cname[-1] = defname
                if self.docx.exists('/'.join(cname)):
                    name = name
            return name

        nname = get_name(NUMBERING, 'numbering.xml')
        sname = get_name(STYLES, 'styles.xml')
        sename = get_name(SETTINGS, 'settings.xml')
        fname = get_name(FONTS, 'fontTable.xml')
        tname = get_name(THEMES, 'theme1.xml')
        foname = get_name(FOOTNOTES, 'footnotes.xml')
        enname = get_name(ENDNOTES, 'endnotes.xml')
        numbering = self.numbering = Numbering()
        footnotes = self.footnotes = Footnotes()
        fonts = self.fonts = Fonts()

        foraw = enraw = None
        forel, enrel = ({}, {}), ({}, {})
        if sename is not None:
            try:
                seraw = self.docx.read(sename)
            except KeyError:
                self.log.warn('Settings %s do not exist' % sename)
            else:
                self.settings(fromstring(seraw))

        if foname is not None:
            try:
                foraw = self.docx.read(foname)
            except KeyError:
                self.log.warn('Footnotes %s do not exist' % foname)
            else:
                forel = self.docx.get_relationships(foname)
        if enname is not None:
            try:
                enraw = self.docx.read(enname)
            except KeyError:
                self.log.warn('Endnotes %s do not exist' % enname)
            else:
                enrel = self.docx.get_relationships(enname)
        footnotes(fromstring(foraw) if foraw else None, forel, fromstring(enraw) if enraw else None, enrel)

        if fname is not None:
            embed_relationships = self.docx.get_relationships(fname)[0]
            try:
                raw = self.docx.read(fname)
            except KeyError:
                self.log.warn('Fonts table %s does not exist' % fname)
            else:
                fonts(fromstring(raw), embed_relationships, self.docx, self.dest_dir)

        if tname is not None:
            try:
                raw = self.docx.read(tname)
            except KeyError:
                self.log.warn('Styles %s do not exist' % sname)
            else:
                self.theme(fromstring(raw))

        if sname is not None:
            try:
                raw = self.docx.read(sname)
            except KeyError:
                self.log.warn('Styles %s do not exist' % sname)
            else:
                self.styles(fromstring(raw), fonts, self.theme)

        if nname is not None:
            try:
                raw = self.docx.read(nname)
            except KeyError:
                self.log.warn('Numbering styles %s do not exist' % nname)
            else:
                numbering(fromstring(raw), self.styles, self.docx.get_relationships(nname)[0])

        self.styles.resolve_numbering(numbering)

    def write(self, doc):
        toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map, self.log)
        raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
        with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
            f.write(raw)
        css = self.styles.generate_css(self.dest_dir, self.docx)
        if css:
            with open(os.path.join(self.dest_dir, 'docx.css'), 'wb') as f:
                f.write(css.encode('utf-8'))

        opf = OPFCreator(self.dest_dir, self.mi)
        opf.toc = toc
        opf.create_manifest_from_files_in([self.dest_dir])
        for item in opf.manifest:
            if item.media_type == 'text/html':
                item.media_type = guess_type('a.xhtml')[0]
        opf.create_spine(['index.html'])
        if self.cover_image is not None:
            opf.guide.set_cover(self.cover_image)
        toc_file = os.path.join(self.dest_dir, 'toc.ncx')
        with open(os.path.join(self.dest_dir, 'metadata.opf'), 'wb') as of, open(toc_file, 'wb') as ncx:
            opf.render(of, ncx, 'toc.ncx')
        if os.path.getsize(toc_file) == 0:
            os.remove(toc_file)
        return os.path.join(self.dest_dir, 'metadata.opf')

    def read_block_anchors(self, doc):
        doc_anchors = frozenset(XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
        if doc_anchors:
            current_bm = set()
            rmap = {v:k for k, v in self.object_map.iteritems()}
            for p in descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
                if p.tag.endswith('}p'):
                    if current_bm and p in rmap:
                        para = rmap[p]
                        if 'id' not in para.attrib:
                            para.set('id', generate_anchor(next(iter(current_bm)), frozenset(self.anchor_map.itervalues())))
                        for name in current_bm:
                            self.anchor_map[name] = para.get('id')
                        current_bm = set()
                elif p in doc_anchors:
                    anchor = get(p, 'w:name')
                    if anchor:
                        current_bm.add(anchor)

    def convert_p(self, p):
        dest = P()
        self.object_map[dest] = p
        style = self.styles.resolve_paragraph(p)
        self.layers[p] = []
        self.add_frame(dest, style.frame)

        current_anchor = None
        current_hyperlink = None
        hl_xpath = XPath('ancestor::w:hyperlink[1]')

        def p_parent(x):
            # Ensure that nested <w:p> tags are handled. These can occur if a
            # textbox is present inside a paragraph.
            while True:
                x = x.getparent()
                try:
                    if x.tag.endswith('}p'):
                        return x
                except AttributeError:
                    break

        for x in descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'):
            if p_parent(x) is not p:
                continue
            if x.tag.endswith('}r'):
                span = self.convert_run(x)
                if current_anchor is not None:
                    (dest if len(dest) == 0 else span).set('id', current_anchor)
                    current_anchor = None
                if current_hyperlink is not None:
                    try:
                        hl = hl_xpath(x)[0]
                        self.link_map[hl].append(span)
                        self.link_source_map[hl] = self.current_rels
                        x.set('is-link', '1')
                    except IndexError:
                        current_hyperlink = None
                dest.append(span)
                self.layers[p].append(x)
            elif x.tag.endswith('}bookmarkStart'):
                anchor = get(x, 'w:name')
                if anchor and anchor not in self.anchor_map:
                    old_anchor = current_anchor
                    self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(self.anchor_map.itervalues()))
                    if old_anchor is not None:
                        # The previous anchor was not applied to any element
                        for a, t in tuple(self.anchor_map.iteritems()):
                            if t == old_anchor:
                                self.anchor_map[a] = current_anchor
            elif x.tag.endswith('}hyperlink'):
                current_hyperlink = x

        m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
        if m is not None:
            n = min(6, max(1, int(m.group(1))))
            dest.tag = 'h%d' % n

        if style.direction == 'rtl':
            dest.set('dir', 'rtl')

        border_runs = []
        common_borders = []
        for span in dest:
            run = self.object_map[span]
            style = self.styles.resolve_run(run)
            if not border_runs or border_runs[-1][1].same_border(style):
                border_runs.append((span, style))
            elif border_runs:
                if len(border_runs) > 1:
                    common_borders.append(border_runs)
                border_runs = []

        for border_run in common_borders:
            spans = []
            bs = {}
            for span, style in border_run:
                style.get_border_css(bs)
                style.clear_border_css()
                spans.append(span)
            if bs:
                cls = self.styles.register(bs, 'text_border')
                wrapper = self.wrap_elems(spans, SPAN())
                wrapper.set('class', cls)

        if not dest.text and len(dest) == 0:
            # Empty paragraph add a non-breaking space so that it is rendered
            # by WebKit
            dest.text = NBSP

        # If the last element in a block is a <br> the <br> is not rendered in
        # HTML, unless it is followed by a trailing space. Word, on the other
        # hand inserts a blank line for trailing <br>s.
        if len(dest) > 0 and not dest[-1].tail:
            if dest[-1].tag == 'br':
                dest[-1].tail = NBSP
            elif len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and not dest[-1][-1].tail:
                dest[-1][-1].tail = NBSP

        return dest

    def wrap_elems(self, elems, wrapper):
        p = elems[0].getparent()
        idx = p.index(elems[0])
        p.insert(idx, wrapper)
        wrapper.tail = elems[-1].tail
        elems[-1].tail = None
        for elem in elems:
            p.remove(elem)
            wrapper.append(elem)
        return wrapper

    def resolve_links(self):
        self.resolved_link_map = {}
        for hyperlink, spans in self.link_map.iteritems():
            relationships_by_id = self.link_source_map[hyperlink]
            span = spans[0]
            if len(spans) > 1:
                span = self.wrap_elems(spans, SPAN())
            span.tag = 'a'
            self.resolved_link_map[hyperlink] = span
            tgt = get(hyperlink, 'w:tgtFrame')
            if tgt:
                span.set('target', tgt)
            tt = get(hyperlink, 'w:tooltip')
            if tt:
                span.set('title', tt)
            rid = get(hyperlink, 'r:id')
            if rid and rid in relationships_by_id:
                span.set('href', relationships_by_id[rid])
                continue
            anchor = get(hyperlink, 'w:anchor')
            if anchor and anchor in self.anchor_map:
                span.set('href', '#' + self.anchor_map[anchor])
                continue
            self.log.warn('Hyperlink with unknown target (rid=%s, anchor=%s), ignoring' %
                          (rid, anchor))
            # hrefs that point nowhere give epubcheck a hernia. The element
            # should be styled explicitly by Word anyway.
            # span.set('href', '#')
        rmap = {v:k for k, v in self.object_map.iteritems()}
        for hyperlink, runs in self.fields.hyperlink_fields:
            spans = [rmap[r] for r in runs if r in rmap]
            if not spans:
                continue
            span = spans[0]
            if len(spans) > 1:
                span = self.wrap_elems(spans, SPAN())
            span.tag = 'a'
            tgt = hyperlink.get('target', None)
            if tgt:
                span.set('target', tgt)
            tt = hyperlink.get('title', None)
            if tt:
                span.set('title', tt)
            url = hyperlink.get('url', None)
            if url is None:
                anchor = hyperlink.get('anchor', None)
                if anchor in self.anchor_map:
                    span.set('href', '#' + self.anchor_map[anchor])
                    continue
                self.log.warn('Hyperlink field with unknown anchor: %s' % anchor)
            else:
                if url in self.anchor_map:
                    span.set('href', '#' + self.anchor_map[url])
                    continue
                span.set('href', url)

        for img, link, relationships_by_id in self.images.links:
            parent = img.getparent()
            idx = parent.index(img)
            a = A(img)
            a.tail, img.tail = img.tail, None
            parent.insert(idx, a)
            tgt = link.get('target', None)
            if tgt:
                a.set('target', tgt)
            tt = link.get('title', None)
            if tt:
                a.set('title', tt)
            rid = link['id']
            if rid in relationships_by_id:
                dest = relationships_by_id[rid]
                if dest.startswith('#'):
                    if dest[1:] in self.anchor_map:
                        a.set('href', '#' + self.anchor_map[dest[1:]])
                else:
                    a.set('href', dest)

    def convert_run(self, run):
        ans = SPAN()
        self.object_map[ans] = run
        text = Text(ans, 'text', [])

        for child in run:
            if is_tag(child, 'w:t'):
                if not child.text:
                    continue
                space = child.get(XML('space'), None)
                preserve = False
                if space == 'preserve':
                    # Only use a <span> with white-space:pre-wrap if this element
                    # actually needs it, i.e. if it has more than one
                    # consecutive space or it has newlines or tabs.
                    multi_spaces = self.ms_pat.search(child.text) is not None
                    preserve = multi_spaces or self.ws_pat.search(child.text) is not None
                if preserve:
                    text.add_elem(SPAN(child.text, style="white-space:pre-wrap"))
                    ans.append(text.elem)
                else:
                    text.buf.append(child.text)
            elif is_tag(child, 'w:cr'):
                text.add_elem(BR())
                ans.append(text.elem)
            elif is_tag(child, 'w:br'):
                typ = get(child, 'w:type')
                if typ in {'column', 'page'}:
                    br = BR(style='page-break-after:always')
                else:
                    clear = child.get('clear', None)
                    if clear in {'all', 'left', 'right'}:
                        br = BR(style='clear:%s'%('both' if clear == 'all' else clear))
                    else:
                        br = BR()
                text.add_elem(br)
                ans.append(text.elem)
            elif is_tag(child, 'w:drawing') or is_tag(child, 'w:pict'):
                for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
                    text.add_elem(img)
                    ans.append(text.elem)
            elif is_tag(child, 'w:footnoteReference') or is_tag(child, 'w:endnoteReference'):
                anchor, name = self.footnotes.get_ref(child)
                if anchor and name:
                    l = SUP(A(name, href='#' + anchor, title=name), id='back_%s' % anchor)
                    l.set('class', 'noteref')
                    text.add_elem(l)
                    ans.append(text.elem)
            elif is_tag(child, 'w:tab'):
                spaces = int(math.ceil((self.settings.default_tab_stop / 36) * 6))
                text.add_elem(SPAN(NBSP * spaces))
                ans.append(text.elem)
                ans[-1].set('class', 'tab')
            elif is_tag(child, 'w:noBreakHyphen'):
                text.buf.append(u'\u2011')
            elif is_tag(child, 'w:softHyphen'):
                text.buf.append(u'\u00ad')
        if text.buf:
            setattr(text.elem, text.attr, ''.join(text.buf))

        style = self.styles.resolve_run(run)
        if style.vert_align in {'superscript', 'subscript'}:
            ans.tag = 'sub' if style.vert_align == 'subscript' else 'sup'
        if style.lang is not inherit:
            ans.lang = style.lang
        return ans

    def add_frame(self, html_obj, style):
        last_run = self.framed[-1]
        if style is inherit:
            if last_run:
                self.framed.append([])
            return

        if last_run:
            if last_run[-1][1] == style:
                last_run.append((html_obj, style))
            else:
                self.framed[-1].append((html_obj, style))
        else:
            last_run.append((html_obj, style))

    def apply_frames(self):
        for run in filter(None, self.framed):
            style = run[0][1]
            paras = tuple(x[0] for x in run)
            parent = paras[0].getparent()
            idx = parent.index(paras[0])
            frame = DIV(*paras)
            parent.insert(idx, frame)
            self.framed_map[frame] = css = style.css(self.page_map[self.object_map[paras[0]]])
            self.styles.register(css, 'frame')
Пример #9
0
class Convert(object):
    def __init__(self,
                 path_or_stream,
                 dest_dir=None,
                 log=None,
                 notes_text=None):
        self.docx = DOCX(path_or_stream, log=log)
        self.log = self.docx.log
        self.notes_text = notes_text or _('Notes')
        self.dest_dir = dest_dir or os.getcwdu()
        self.mi = self.docx.metadata
        self.body = BODY()
        self.styles = Styles()
        self.images = Images()
        self.tables = Tables()
        self.object_map = OrderedDict()
        self.html = HTML(
            HEAD(
                META(charset='utf-8'),
                TITLE(self.mi.title or _('Unknown')),
                LINK(rel='stylesheet', type='text/css', href='docx.css'),
            ), self.body)
        self.html.text = '\n\t'
        self.html[0].text = '\n\t\t'
        self.html[0].tail = '\n'
        for child in self.html[0]:
            child.tail = '\n\t\t'
        self.html[0][-1].tail = '\n\t'
        self.html[1].text = self.html[1].tail = '\n'
        lang = canonicalize_lang(self.mi.language)
        if lang and lang != 'und':
            lang = lang_as_iso639_1(lang)
            if lang:
                self.html.set('lang', lang)

    def __call__(self):
        doc = self.docx.document
        relationships_by_id, relationships_by_type = self.docx.document_relationships
        self.read_styles(relationships_by_type)
        self.images(relationships_by_id)
        self.layers = OrderedDict()
        self.framed = [[]]
        self.framed_map = {}
        self.anchor_map = {}
        self.link_map = defaultdict(list)

        self.read_page_properties(doc)
        for wp, page_properties in self.page_map.iteritems():
            self.current_page = page_properties
            p = self.convert_p(wp)
            self.body.append(p)

        notes_header = None
        if self.footnotes.has_notes:
            dl = DL()
            dl.set('class', 'notes')
            self.body.append(H1(self.notes_text))
            notes_header = self.body[-1]
            notes_header.set('class', 'notes-header')
            self.body.append(dl)
            for anchor, text, note in self.footnotes:
                dl.append(
                    DT('[',
                       A('←' + text, href='#back_%s' % anchor, title=text),
                       id=anchor))
                dl[-1][0].tail = ']'
                dl.append(DD())
                in_table = False
                for wp in note:
                    if wp.tag.endswith('}tbl'):
                        self.tables.register(wp)
                        in_table = True
                        continue
                    if in_table:
                        if ancestor(wp, 'w:tbl') is not None:
                            self.tables.add(wp)
                        else:
                            in_table = False
                    p = self.convert_p(wp)
                    dl[-1].append(p)

        self.resolve_links(relationships_by_id)

        self.styles.cascade(self.layers)

        self.tables.apply_markup(self.object_map)

        numbered = []
        for html_obj, obj in self.object_map.iteritems():
            raw = obj.get('calibre_num_id', None)
            if raw is not None:
                lvl, num_id = raw.partition(':')[0::2]
                try:
                    lvl = int(lvl)
                except (TypeError, ValueError):
                    lvl = 0
                numbered.append((html_obj, num_id, lvl))
        self.numbering.apply_markup(numbered, self.body, self.styles,
                                    self.object_map)
        self.apply_frames()

        if len(self.body) > 0:
            self.body.text = '\n\t'
            for child in self.body:
                child.tail = '\n\t'
            self.body[-1].tail = '\n'

        self.styles.generate_classes()
        for html_obj, obj in self.object_map.iteritems():
            style = self.styles.resolve(obj)
            if style is not None:
                css = style.css
                if css:
                    cls = self.styles.class_name(css)
                    if cls:
                        html_obj.set('class', cls)
        for html_obj, css in self.framed_map.iteritems():
            cls = self.styles.class_name(css)
            if cls:
                html_obj.set('class', cls)

        if notes_header is not None:
            for h in self.body.iterchildren('h1', 'h2', 'h3'):
                notes_header.tag = h.tag
                cls = h.get('class', None)
                if cls and cls != 'notes-header':
                    notes_header.set('class', '%s notes-header' % cls)
                break

        return self.write()

    def read_page_properties(self, doc):
        current = []
        self.page_map = OrderedDict()

        in_table = False

        for p in descendants(doc, 'w:p', 'w:tbl'):
            if p.tag.endswith('}tbl'):
                in_table = True
                self.tables.register(p)
                continue
            sect = tuple(descendants(p, 'w:sectPr'))
            if sect:
                pr = PageProperties(sect)
                for x in current + [p]:
                    self.page_map[x] = pr
                current = []
            else:
                current.append(p)
            if in_table:
                if ancestor(p, 'w:tbl') is not None:
                    self.tables.add(p)
                else:
                    in_table = False
        if current:
            last = XPath('./w:body/w:sectPr')(doc)
            pr = PageProperties(last)
            for x in current:
                self.page_map[x] = pr

    def read_styles(self, relationships_by_type):
        def get_name(rtype, defname):
            name = relationships_by_type.get(rtype, None)
            if name is None:
                cname = self.docx.document_name.split('/')
                cname[-1] = defname
                if self.docx.exists('/'.join(cname)):
                    name = name
            return name

        nname = get_name(NUMBERING, 'numbering.xml')
        sname = get_name(STYLES, 'styles.xml')
        fname = get_name(FONTS, 'fontTable.xml')
        foname = get_name(FOOTNOTES, 'footnotes.xml')
        enname = get_name(ENDNOTES, 'endnotes.xml')
        numbering = self.numbering = Numbering()
        footnotes = self.footnotes = Footnotes()
        fonts = self.fonts = Fonts()

        foraw = enraw = None
        if foname is not None:
            try:
                foraw = self.docx.read(foname)
            except KeyError:
                self.log.warn('Footnotes %s do not exist' % foname)
        if enname is not None:
            try:
                enraw = self.docx.read(enname)
            except KeyError:
                self.log.warn('Endnotes %s do not exist' % enname)
        footnotes(
            fromstring(foraw) if foraw else None,
            fromstring(enraw) if enraw else None)

        if fname is not None:
            embed_relationships = self.docx.get_relationships(fname)[0]
            try:
                raw = self.docx.read(fname)
            except KeyError:
                self.log.warn('Fonts table %s does not exist' % fname)
            else:
                fonts(fromstring(raw), embed_relationships, self.docx,
                      self.dest_dir)

        if sname is not None:
            try:
                raw = self.docx.read(sname)
            except KeyError:
                self.log.warn('Styles %s do not exist' % sname)
            else:
                self.styles(fromstring(raw), fonts)

        if nname is not None:
            try:
                raw = self.docx.read(nname)
            except KeyError:
                self.log.warn('Numbering styles %s do not exist' % nname)
            else:
                numbering(fromstring(raw), self.styles)

        self.styles.resolve_numbering(numbering)

    def create_toc(self):
        ' Create a TOC from headings in the document '
        root = self.body
        headings = ('h1', 'h2', 'h3')
        tocroot = TOC()
        xpaths = [XPath('//%s' % x) for x in headings]
        level_prev = {i + 1: None for i in xrange(len(xpaths))}
        level_prev[0] = tocroot
        level_item_map = {
            i + 1: frozenset(xp(root))
            for i, xp in enumerate(xpaths)
        }
        item_level_map = {
            e: i
            for i, elems in level_item_map.iteritems() for e in elems
        }

        self.idcount = 0

        def ensure_id(elem):
            ans = elem.get('id', None)
            if not ans:
                self.idcount += 1
                ans = 'toc_id_%d' % self.idcount
                elem.set('id', ans)
            return ans

        for item in root.iterdescendants(*headings):
            lvl = plvl = item_level_map.get(item, None)
            if lvl is None:
                continue
            parent = None
            while parent is None:
                plvl -= 1
                parent = level_prev[plvl]
            lvl = plvl + 1
            elem_id = ensure_id(item)
            text = elem_to_toc_text(item)
            toc = parent.add_item('index.html', elem_id, text)
            level_prev[lvl] = toc
            for i in xrange(lvl + 1, len(xpaths) + 1):
                level_prev[i] = None

        if len(tuple(tocroot.flat())) > 1:
            return tocroot

    def write(self):
        toc = self.create_toc()
        raw = html.tostring(self.html,
                            encoding='utf-8',
                            doctype='<!DOCTYPE html>')
        with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
            f.write(raw)
        css = self.styles.generate_css(self.dest_dir, self.docx)
        if css:
            with open(os.path.join(self.dest_dir, 'docx.css'), 'wb') as f:
                f.write(css.encode('utf-8'))

        opf = OPFCreator(self.dest_dir, self.mi)
        opf.toc = toc
        opf.create_manifest_from_files_in([self.dest_dir])
        opf.create_spine(['index.html'])
        with open(os.path.join(self.dest_dir, 'metadata.opf'),
                  'wb') as of, open(os.path.join(self.dest_dir, 'toc.ncx'),
                                    'wb') as ncx:
            opf.render(of, ncx, 'toc.ncx')
        return os.path.join(self.dest_dir, 'metadata.opf')

    def convert_p(self, p):
        dest = P()
        self.object_map[dest] = p
        style = self.styles.resolve_paragraph(p)
        self.layers[p] = []
        self.add_frame(dest, style.frame)

        current_anchor = None
        current_hyperlink = None

        for x in descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'):
            if x.tag.endswith('}r'):
                span = self.convert_run(x)
                if current_anchor is not None:
                    (dest if len(dest) == 0 else span).set(
                        'id', current_anchor)
                    current_anchor = None
                if current_hyperlink is not None:
                    hl = ancestor(x, 'w:hyperlink')
                    if hl is not None:
                        self.link_map[hl].append(span)
                    else:
                        current_hyperlink = None
                dest.append(span)
                self.layers[p].append(x)
            elif x.tag.endswith('}bookmarkStart'):
                anchor = get(x, 'w:name')
                if anchor and anchor not in self.anchor_map:
                    self.anchor_map[anchor] = current_anchor = generate_anchor(
                        anchor, frozenset(self.anchor_map.itervalues()))
            elif x.tag.endswith('}hyperlink'):
                current_hyperlink = x

        m = re.match(r'heading\s+(\d+)$', style.style_name or '',
                     re.IGNORECASE)
        if m is not None:
            n = min(6, max(1, int(m.group(1))))
            dest.tag = 'h%d' % n

        if style.direction == 'rtl':
            dest.set('dir', 'rtl')

        border_runs = []
        common_borders = []
        for span in dest:
            run = self.object_map[span]
            style = self.styles.resolve_run(run)
            if not border_runs or border_runs[-1][1].same_border(style):
                border_runs.append((span, style))
            elif border_runs:
                if len(border_runs) > 1:
                    common_borders.append(border_runs)
                border_runs = []

        for border_run in common_borders:
            spans = []
            bs = {}
            for span, style in border_run:
                style.get_border_css(bs)
                style.clear_border_css()
                spans.append(span)
            if bs:
                cls = self.styles.register(bs, 'text_border')
                wrapper = self.wrap_elems(spans, SPAN())
                wrapper.set('class', cls)

        return dest

    def wrap_elems(self, elems, wrapper):
        p = elems[0].getparent()
        idx = p.index(elems[0])
        p.insert(idx, wrapper)
        wrapper.tail = elems[-1].tail
        elems[-1].tail = None
        for elem in elems:
            p.remove(elem)
            wrapper.append(elem)
        return wrapper

    def resolve_links(self, relationships_by_id):
        for hyperlink, spans in self.link_map.iteritems():
            span = spans[0]
            if len(spans) > 1:
                span = self.wrap_elems(spans, SPAN())
            span.tag = 'a'
            tgt = get(hyperlink, 'w:tgtFrame')
            if tgt:
                span.set('target', tgt)
            tt = get(hyperlink, 'w:tooltip')
            if tt:
                span.set('title', tt)
            rid = get(hyperlink, 'r:id')
            if rid and rid in relationships_by_id:
                span.set('href', relationships_by_id[rid])
                continue
            anchor = get(hyperlink, 'w:anchor')
            if anchor and anchor in self.anchor_map:
                span.set('href', '#' + self.anchor_map[anchor])
                continue
            self.log.warn('Hyperlink with unknown target (%s, %s), ignoring' %
                          (rid, anchor))
            span.set('href', '#')

    def convert_run(self, run):
        ans = SPAN()
        self.object_map[ans] = run
        text = Text(ans, 'text', [])

        for child in run:
            if is_tag(child, 'w:t'):
                if not child.text:
                    continue
                space = child.get(XML('space'), None)
                if space == 'preserve':
                    text.add_elem(SPAN(child.text,
                                       style="whitespace:pre-wrap"))
                    ans.append(text.elem)
                else:
                    text.buf.append(child.text)
            elif is_tag(child, 'w:cr'):
                text.add_elem(BR())
                ans.append(text.elem)
            elif is_tag(child, 'w:br'):
                typ = child.get('type', None)
                if typ in {'column', 'page'}:
                    br = BR(style='page-break-after:always')
                else:
                    clear = child.get('clear', None)
                    if clear in {'all', 'left', 'right'}:
                        br = BR(style='clear:%s' %
                                ('both' if clear == 'all' else clear))
                    else:
                        br = BR()
                text.add_elem(br)
                ans.append(text.elem)
            elif is_tag(child, 'w:drawing') or is_tag(child, 'w:pict'):
                for img in self.images.to_html(child, self.current_page,
                                               self.docx, self.dest_dir):
                    text.add_elem(img)
                    ans.append(text.elem)
            elif is_tag(child, 'w:footnoteReference') or is_tag(
                    child, 'w:endnoteReference'):
                anchor, name = self.footnotes.get_ref(child)
                if anchor and name:
                    l = SUP(A(name, href='#' + anchor, title=name),
                            id='back_%s' % anchor)
                    l.set('class', 'noteref')
                    text.add_elem(l)
                    ans.append(text.elem)
        if text.buf:
            setattr(text.elem, text.attr, ''.join(text.buf))

        style = self.styles.resolve_run(run)
        if style.vert_align in {'superscript', 'subscript'}:
            ans.tag = 'sub' if style.vert_align == 'subscript' else 'sup'
        if style.lang is not inherit:
            ans.lang = style.lang
        return ans

    def add_frame(self, html_obj, style):
        last_run = self.framed[-1]
        if style is inherit:
            if last_run:
                self.framed.append([])
            return

        if last_run:
            if last_run[-1][1] == style:
                last_run.append((html_obj, style))
            else:
                self.framed.append((html_obj, style))
        else:
            last_run.append((html_obj, style))

    def apply_frames(self):
        for run in filter(None, self.framed):
            style = run[0][1]
            paras = tuple(x[0] for x in run)
            parent = paras[0].getparent()
            idx = parent.index(paras[0])
            frame = DIV(*paras)
            parent.insert(idx, frame)
            self.framed_map[frame] = css = style.css(
                self.page_map[self.object_map[paras[0]]])
            self.styles.register(css, 'frame')
Пример #10
0
class Convert(object):

    def __init__(self, path_or_stream, dest_dir=None, log=None, notes_text=None):
        self.docx = DOCX(path_or_stream, log=log)
        self.ms_pat = re.compile(r'\s{2,}')
        self.ws_pat = re.compile(r'[\n\r\t]')
        self.log = self.docx.log
        self.notes_text = notes_text or _('Notes')
        self.dest_dir = dest_dir or os.getcwdu()
        self.mi = self.docx.metadata
        self.body = BODY()
        self.tables = Tables()
        self.styles = Styles(self.tables)
        self.images = Images()
        self.object_map = OrderedDict()
        self.html = HTML(
            HEAD(
                META(charset='utf-8'),
                TITLE(self.mi.title or _('Unknown')),
                LINK(rel='stylesheet', type='text/css', href='docx.css'),
            ),
            self.body
        )
        self.html.text='\n\t'
        self.html[0].text='\n\t\t'
        self.html[0].tail='\n'
        for child in self.html[0]:
            child.tail = '\n\t\t'
        self.html[0][-1].tail = '\n\t'
        self.html[1].text = self.html[1].tail = '\n'
        lang = canonicalize_lang(self.mi.language)
        if lang and lang != 'und':
            lang = lang_as_iso639_1(lang)
            if lang:
                self.html.set('lang', lang)

    def __call__(self):
        doc = self.docx.document
        relationships_by_id, relationships_by_type = self.docx.document_relationships
        self.read_styles(relationships_by_type)
        self.images(relationships_by_id)
        self.layers = OrderedDict()
        self.framed = [[]]
        self.framed_map = {}
        self.anchor_map = {}
        self.link_map = defaultdict(list)

        self.read_page_properties(doc)
        for wp, page_properties in self.page_map.iteritems():
            self.current_page = page_properties
            if wp.tag.endswith('}p'):
                p = self.convert_p(wp)
                self.body.append(p)

        notes_header = None
        if self.footnotes.has_notes:
            dl = DL()
            dl.set('class', 'notes')
            self.body.append(H1(self.notes_text))
            notes_header = self.body[-1]
            notes_header.set('class', 'notes-header')
            self.body.append(dl)
            for anchor, text, note in self.footnotes:
                dl.append(DT('[', A('←' + text, href='#back_%s' % anchor, title=text), id=anchor))
                dl[-1][0].tail = ']'
                dl.append(DD())
                for wp in note:
                    if wp.tag.endswith('}tbl'):
                        self.tables.register(wp, self.styles)
                        self.page_map[wp] = self.current_page
                    p = self.convert_p(wp)
                    dl[-1].append(p)

        self.resolve_links(relationships_by_id)

        self.styles.cascade(self.layers)

        self.tables.apply_markup(self.object_map, self.page_map)

        numbered = []
        for html_obj, obj in self.object_map.iteritems():
            raw = obj.get('calibre_num_id', None)
            if raw is not None:
                lvl, num_id = raw.partition(':')[0::2]
                try:
                    lvl = int(lvl)
                except (TypeError, ValueError):
                    lvl = 0
                numbered.append((html_obj, num_id, lvl))
        self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map)
        self.apply_frames()

        if len(self.body) > 0:
            self.body.text = '\n\t'
            for child in self.body:
                child.tail = '\n\t'
            self.body[-1].tail = '\n'

        self.styles.generate_classes()
        for html_obj, obj in self.object_map.iteritems():
            style = self.styles.resolve(obj)
            if style is not None:
                css = style.css
                if css:
                    cls = self.styles.class_name(css)
                    if cls:
                        html_obj.set('class', cls)
        for html_obj, css in self.framed_map.iteritems():
            cls = self.styles.class_name(css)
            if cls:
                html_obj.set('class', cls)

        if notes_header is not None:
            for h in children(self.body, 'h1', 'h2', 'h3'):
                notes_header.tag = h.tag
                cls = h.get('class', None)
                if cls and cls != 'notes-header':
                    notes_header.set('class', '%s notes-header' % cls)
                break

        return self.write()

    def read_page_properties(self, doc):
        current = []
        self.page_map = OrderedDict()

        for p in descendants(doc, 'w:p', 'w:tbl'):
            if p.tag.endswith('}tbl'):
                self.tables.register(p, self.styles)
                current.append(p)
                continue
            sect = tuple(descendants(p, 'w:sectPr'))
            if sect:
                pr = PageProperties(sect)
                for x in current + [p]:
                    self.page_map[x] = pr
                current = []
            else:
                current.append(p)

        if current:
            last = XPath('./w:body/w:sectPr')(doc)
            pr = PageProperties(last)
            for x in current:
                self.page_map[x] = pr

    def read_styles(self, relationships_by_type):

        def get_name(rtype, defname):
            name = relationships_by_type.get(rtype, None)
            if name is None:
                cname = self.docx.document_name.split('/')
                cname[-1] = defname
                if self.docx.exists('/'.join(cname)):
                    name = name
            return name

        nname = get_name(NUMBERING, 'numbering.xml')
        sname = get_name(STYLES, 'styles.xml')
        fname = get_name(FONTS, 'fontTable.xml')
        foname = get_name(FOOTNOTES, 'footnotes.xml')
        enname = get_name(ENDNOTES, 'endnotes.xml')
        numbering = self.numbering = Numbering()
        footnotes = self.footnotes = Footnotes()
        fonts = self.fonts = Fonts()

        foraw = enraw = None
        if foname is not None:
            try:
                foraw = self.docx.read(foname)
            except KeyError:
                self.log.warn('Footnotes %s do not exist' % foname)
        if enname is not None:
            try:
                enraw = self.docx.read(enname)
            except KeyError:
                self.log.warn('Endnotes %s do not exist' % enname)
        footnotes(fromstring(foraw) if foraw else None, fromstring(enraw) if enraw else None)

        if fname is not None:
            embed_relationships = self.docx.get_relationships(fname)[0]
            try:
                raw = self.docx.read(fname)
            except KeyError:
                self.log.warn('Fonts table %s does not exist' % fname)
            else:
                fonts(fromstring(raw), embed_relationships, self.docx, self.dest_dir)

        if sname is not None:
            try:
                raw = self.docx.read(sname)
            except KeyError:
                self.log.warn('Styles %s do not exist' % sname)
            else:
                self.styles(fromstring(raw), fonts)

        if nname is not None:
            try:
                raw = self.docx.read(nname)
            except KeyError:
                self.log.warn('Numbering styles %s do not exist' % nname)
            else:
                numbering(fromstring(raw), self.styles)

        self.styles.resolve_numbering(numbering)

    def create_toc(self):
        ' Create a TOC from headings in the document '
        root = self.body
        headings = ('h1', 'h2', 'h3')
        tocroot = TOC()
        xpaths = [XPath('//%s' % x) for x in headings]
        level_prev = {i+1:None for i in xrange(len(xpaths))}
        level_prev[0] = tocroot
        level_item_map = {i+1:frozenset(xp(root)) for i, xp in enumerate(xpaths)}
        item_level_map = {e:i for i, elems in level_item_map.iteritems() for e in elems}

        self.idcount = 0

        def ensure_id(elem):
            ans = elem.get('id', None)
            if not ans:
                self.idcount += 1
                ans = 'toc_id_%d' % self.idcount
                elem.set('id', ans)
            return ans

        for item in descendants(root, *headings):
            lvl = plvl = item_level_map.get(item, None)
            if lvl is None:
                continue
            parent = None
            while parent is None:
                plvl -= 1
                parent = level_prev[plvl]
            lvl = plvl + 1
            elem_id = ensure_id(item)
            text = elem_to_toc_text(item)
            toc = parent.add_item('index.html', elem_id, text)
            level_prev[lvl] = toc
            for i in xrange(lvl+1, len(xpaths)+1):
                level_prev[i] = None

        if len(tuple(tocroot.flat())) > 1:
            return tocroot

    def write(self):
        toc = self.create_toc()
        raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
        with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
            f.write(raw)
        css = self.styles.generate_css(self.dest_dir, self.docx)
        if css:
            with open(os.path.join(self.dest_dir, 'docx.css'), 'wb') as f:
                f.write(css.encode('utf-8'))

        opf = OPFCreator(self.dest_dir, self.mi)
        opf.toc = toc
        opf.create_manifest_from_files_in([self.dest_dir])
        opf.create_spine(['index.html'])
        with open(os.path.join(self.dest_dir, 'metadata.opf'), 'wb') as of, open(os.path.join(self.dest_dir, 'toc.ncx'), 'wb') as ncx:
            opf.render(of, ncx, 'toc.ncx')
        return os.path.join(self.dest_dir, 'metadata.opf')

    def convert_p(self, p):
        dest = P()
        self.object_map[dest] = p
        style = self.styles.resolve_paragraph(p)
        self.layers[p] = []
        self.add_frame(dest, style.frame)

        current_anchor = None
        current_hyperlink = None
        hl_xpath = XPath('ancestor::w:hyperlink[1]')

        for x in descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'):
            if x.tag.endswith('}r'):
                span = self.convert_run(x)
                if current_anchor is not None:
                    (dest if len(dest) == 0 else span).set('id', current_anchor)
                    current_anchor = None
                if current_hyperlink is not None:
                    try:
                        hl = hl_xpath(x)[0]
                        self.link_map[hl].append(span)
                    except IndexError:
                        current_hyperlink = None
                dest.append(span)
                self.layers[p].append(x)
            elif x.tag.endswith('}bookmarkStart'):
                anchor = get(x, 'w:name')
                if anchor and anchor not in self.anchor_map:
                    self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(self.anchor_map.itervalues()))
            elif x.tag.endswith('}hyperlink'):
                current_hyperlink = x

        m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
        if m is not None:
            n = min(6, max(1, int(m.group(1))))
            dest.tag = 'h%d' % n

        if style.direction == 'rtl':
            dest.set('dir', 'rtl')

        border_runs = []
        common_borders = []
        for span in dest:
            run = self.object_map[span]
            style = self.styles.resolve_run(run)
            if not border_runs or border_runs[-1][1].same_border(style):
                border_runs.append((span, style))
            elif border_runs:
                if len(border_runs) > 1:
                    common_borders.append(border_runs)
                border_runs = []

        for border_run in common_borders:
            spans = []
            bs = {}
            for span, style in border_run:
                style.get_border_css(bs)
                style.clear_border_css()
                spans.append(span)
            if bs:
                cls = self.styles.register(bs, 'text_border')
                wrapper = self.wrap_elems(spans, SPAN())
                wrapper.set('class', cls)

        if not dest.text and len(dest) == 0:
            # Empty paragraph add a non-breaking space so that it is rendered
            # by WebKit
            dest.text = '\xa0'
        return dest

    def wrap_elems(self, elems, wrapper):
        p = elems[0].getparent()
        idx = p.index(elems[0])
        p.insert(idx, wrapper)
        wrapper.tail = elems[-1].tail
        elems[-1].tail = None
        for elem in elems:
            p.remove(elem)
            wrapper.append(elem)
        return wrapper

    def resolve_links(self, relationships_by_id):
        for hyperlink, spans in self.link_map.iteritems():
            span = spans[0]
            if len(spans) > 1:
                span = self.wrap_elems(spans, SPAN())
            span.tag = 'a'
            tgt = get(hyperlink, 'w:tgtFrame')
            if tgt:
                span.set('target', tgt)
            tt = get(hyperlink, 'w:tooltip')
            if tt:
                span.set('title', tt)
            rid = get(hyperlink, 'r:id')
            if rid and rid in relationships_by_id:
                span.set('href', relationships_by_id[rid])
                continue
            anchor = get(hyperlink, 'w:anchor')
            if anchor and anchor in self.anchor_map:
                span.set('href', '#' + self.anchor_map[anchor])
                continue
            self.log.warn('Hyperlink with unknown target (%s, %s), ignoring' %
                          (rid, anchor))
            span.set('href', '#')

    def convert_run(self, run):
        ans = SPAN()
        self.object_map[ans] = run
        text = Text(ans, 'text', [])

        for child in run:
            if is_tag(child, 'w:t'):
                if not child.text:
                    continue
                space = child.get(XML('space'), None)
                preserve = False
                if space == 'preserve':
                    # Only use a <span> with white-space:pre-wrap if this element
                    # actually needs it, i.e. if it has more than one
                    # consecutive space or it has newlines or tabs.
                    multi_spaces = self.ms_pat.search(child.text) is not None
                    preserve = multi_spaces or self.ws_pat.search(child.text) is not None
                if preserve:
                    text.add_elem(SPAN(child.text, style="white-space:pre-wrap"))
                    ans.append(text.elem)
                else:
                    text.buf.append(child.text)
            elif is_tag(child, 'w:cr'):
                text.add_elem(BR())
                ans.append(text.elem)
            elif is_tag(child, 'w:br'):
                typ = get(child, 'w:type')
                if typ in {'column', 'page'}:
                    br = BR(style='page-break-after:always')
                else:
                    clear = child.get('clear', None)
                    if clear in {'all', 'left', 'right'}:
                        br = BR(style='clear:%s'%('both' if clear == 'all' else clear))
                    else:
                        br = BR()
                text.add_elem(br)
                ans.append(text.elem)
            elif is_tag(child, 'w:drawing') or is_tag(child, 'w:pict'):
                for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
                    text.add_elem(img)
                    ans.append(text.elem)
            elif is_tag(child, 'w:footnoteReference') or is_tag(child, 'w:endnoteReference'):
                anchor, name = self.footnotes.get_ref(child)
                if anchor and name:
                    l = SUP(A(name, href='#' + anchor, title=name), id='back_%s' % anchor)
                    l.set('class', 'noteref')
                    text.add_elem(l)
                    ans.append(text.elem)
            elif is_tag(child, 'w:fldChar') and get(child, 'w:fldCharType') == 'separate':
                text.buf.append('\xa0')
        if text.buf:
            setattr(text.elem, text.attr, ''.join(text.buf))

        style = self.styles.resolve_run(run)
        if style.vert_align in {'superscript', 'subscript'}:
            ans.tag = 'sub' if style.vert_align == 'subscript' else 'sup'
        if style.lang is not inherit:
            ans.lang = style.lang
        return ans

    def add_frame(self, html_obj, style):
        last_run = self.framed[-1]
        if style is inherit:
            if last_run:
                self.framed.append([])
            return

        if last_run:
            if last_run[-1][1] == style:
                last_run.append((html_obj, style))
            else:
                self.framed.append((html_obj, style))
        else:
            last_run.append((html_obj, style))

    def apply_frames(self):
        for run in filter(None, self.framed):
            style = run[0][1]
            paras = tuple(x[0] for x in run)
            parent = paras[0].getparent()
            idx = parent.index(paras[0])
            frame = DIV(*paras)
            parent.insert(idx, frame)
            self.framed_map[frame] = css = style.css(self.page_map[self.object_map[paras[0]]])
            self.styles.register(css, 'frame')
Пример #11
0
class Convert(object):
    def __init__(self,
                 path_or_stream,
                 dest_dir=None,
                 log=None,
                 detect_cover=True,
                 notes_text=None):
        self.docx = DOCX(path_or_stream, log=log)
        self.ms_pat = re.compile(r'\s{2,}')
        self.ws_pat = re.compile(r'[\n\r\t]')
        self.log = self.docx.log
        self.detect_cover = detect_cover
        self.notes_text = notes_text or _('Notes')
        self.dest_dir = dest_dir or os.getcwdu()
        self.mi = self.docx.metadata
        self.body = BODY()
        self.theme = Theme()
        self.settings = Settings()
        self.tables = Tables()
        self.fields = Fields()
        self.styles = Styles(self.tables)
        self.images = Images(self.log)
        self.object_map = OrderedDict()
        self.html = HTML(
            HEAD(
                META(charset='utf-8'),
                TITLE(self.mi.title or _('Unknown')),
                LINK(rel='stylesheet', type='text/css', href='docx.css'),
            ), self.body)
        self.html.text = '\n\t'
        self.html[0].text = '\n\t\t'
        self.html[0].tail = '\n'
        for child in self.html[0]:
            child.tail = '\n\t\t'
        self.html[0][-1].tail = '\n\t'
        self.html[1].text = self.html[1].tail = '\n'
        lang = canonicalize_lang(self.mi.language)
        if lang and lang != 'und':
            lang = lang_as_iso639_1(lang)
            if lang:
                self.html.set('lang', lang)

    def __call__(self):
        doc = self.docx.document
        relationships_by_id, relationships_by_type = self.docx.document_relationships
        self.fields(doc, self.log)
        self.read_styles(relationships_by_type)
        self.images(relationships_by_id)
        self.layers = OrderedDict()
        self.framed = [[]]
        self.framed_map = {}
        self.anchor_map = {}
        self.link_map = defaultdict(list)
        self.link_source_map = {}
        paras = []

        self.log.debug('Converting Word markup to HTML')
        self.read_page_properties(doc)
        self.current_rels = relationships_by_id
        for wp, page_properties in self.page_map.iteritems():
            self.current_page = page_properties
            if wp.tag.endswith('}p'):
                p = self.convert_p(wp)
                self.body.append(p)
                paras.append(wp)
        self.read_block_anchors(doc)
        self.styles.apply_contextual_spacing(paras)
        # Apply page breaks at the start of every section, except the first
        # section (since that will be the start of the file)
        self.styles.apply_section_page_breaks(self.section_starts[1:])

        notes_header = None
        orig_rid_map = self.images.rid_map
        if self.footnotes.has_notes:
            dl = DL()
            dl.set('class', 'notes')
            self.body.append(H1(self.notes_text))
            notes_header = self.body[-1]
            notes_header.set('class', 'notes-header')
            self.body.append(dl)
            for anchor, text, note in self.footnotes:
                dl.append(
                    DT('[',
                       A('←' + text, href='#back_%s' % anchor, title=text),
                       id=anchor))
                dl[-1][0].tail = ']'
                dl.append(DD())
                paras = []
                self.images.rid_map = self.current_rels = note.rels[0]
                for wp in note:
                    if wp.tag.endswith('}tbl'):
                        self.tables.register(wp, self.styles)
                        self.page_map[wp] = self.current_page
                    else:
                        p = self.convert_p(wp)
                        dl[-1].append(p)
                        paras.append(wp)
                self.styles.apply_contextual_spacing(paras)

        for p, wp in self.object_map.iteritems():
            if len(p) > 0 and not p.text and len(
                    p[0]) > 0 and not p[0].text and p[0][0].get('class',
                                                                None) == 'tab':
                # Paragraph uses tabs for indentation, convert to text-indent
                parent = p[0]
                tabs = []
                for child in parent:
                    if child.get('class', None) == 'tab':
                        tabs.append(child)
                        if child.tail:
                            break
                    else:
                        break
                indent = len(tabs) * self.settings.default_tab_stop
                style = self.styles.resolve(wp)
                if style.text_indent is inherit or (
                        hasattr(style.text_indent, 'endswith')
                        and style.text_indent.endswith('pt')):
                    if style.text_indent is not inherit:
                        indent = float(style.text_indent[:-2]) + indent
                    style.text_indent = '%.3gpt' % indent
                    parent.text = tabs[-1].tail or ''
                    map(parent.remove, tabs)

        self.images.rid_map = orig_rid_map

        self.resolve_links()

        self.styles.cascade(self.layers)

        self.tables.apply_markup(self.object_map, self.page_map)

        numbered = []
        for html_obj, obj in self.object_map.iteritems():
            raw = obj.get('calibre_num_id', None)
            if raw is not None:
                lvl, num_id = raw.partition(':')[0::2]
                try:
                    lvl = int(lvl)
                except (TypeError, ValueError):
                    lvl = 0
                numbered.append((html_obj, num_id, lvl))
        self.numbering.apply_markup(numbered, self.body, self.styles,
                                    self.object_map, self.images)
        self.apply_frames()

        if len(self.body) > 0:
            self.body.text = '\n\t'
            for child in self.body:
                child.tail = '\n\t'
            self.body[-1].tail = '\n'

        self.log.debug('Converting styles to CSS')
        self.styles.generate_classes()
        for html_obj, obj in self.object_map.iteritems():
            style = self.styles.resolve(obj)
            if style is not None:
                css = style.css
                if css:
                    cls = self.styles.class_name(css)
                    if cls:
                        html_obj.set('class', cls)
        for html_obj, css in self.framed_map.iteritems():
            cls = self.styles.class_name(css)
            if cls:
                html_obj.set('class', cls)

        if notes_header is not None:
            for h in children(self.body, 'h1', 'h2', 'h3'):
                notes_header.tag = h.tag
                cls = h.get('class', None)
                if cls and cls != 'notes-header':
                    notes_header.set('class', '%s notes-header' % cls)
                break

        self.log.debug('Cleaning up redundant markup generated by Word')
        self.cover_image = cleanup_markup(self.log, self.html, self.styles,
                                          self.dest_dir, self.detect_cover)

        return self.write(doc)

    def read_page_properties(self, doc):
        current = []
        self.page_map = OrderedDict()
        self.section_starts = []

        for p in descendants(doc, 'w:p', 'w:tbl'):
            if p.tag.endswith('}tbl'):
                self.tables.register(p, self.styles)
                current.append(p)
                continue
            sect = tuple(descendants(p, 'w:sectPr'))
            if sect:
                pr = PageProperties(sect)
                paras = current + [p]
                for x in paras:
                    self.page_map[x] = pr
                self.section_starts.append(paras[0])
                current = []
            else:
                current.append(p)

        if current:
            self.section_starts.append(current[0])
            last = XPath('./w:body/w:sectPr')(doc)
            pr = PageProperties(last)
            for x in current:
                self.page_map[x] = pr

    def read_styles(self, relationships_by_type):
        def get_name(rtype, defname):
            name = relationships_by_type.get(rtype, None)
            if name is None:
                cname = self.docx.document_name.split('/')
                cname[-1] = defname
                if self.docx.exists('/'.join(cname)):
                    name = name
            return name

        nname = get_name(NUMBERING, 'numbering.xml')
        sname = get_name(STYLES, 'styles.xml')
        sename = get_name(SETTINGS, 'settings.xml')
        fname = get_name(FONTS, 'fontTable.xml')
        tname = get_name(THEMES, 'theme1.xml')
        foname = get_name(FOOTNOTES, 'footnotes.xml')
        enname = get_name(ENDNOTES, 'endnotes.xml')
        numbering = self.numbering = Numbering()
        footnotes = self.footnotes = Footnotes()
        fonts = self.fonts = Fonts()

        foraw = enraw = None
        forel, enrel = ({}, {}), ({}, {})
        if sename is not None:
            try:
                seraw = self.docx.read(sename)
            except KeyError:
                self.log.warn('Settings %s do not exist' % sename)
            else:
                self.settings(fromstring(seraw))

        if foname is not None:
            try:
                foraw = self.docx.read(foname)
            except KeyError:
                self.log.warn('Footnotes %s do not exist' % foname)
            else:
                forel = self.docx.get_relationships(foname)
        if enname is not None:
            try:
                enraw = self.docx.read(enname)
            except KeyError:
                self.log.warn('Endnotes %s do not exist' % enname)
            else:
                enrel = self.docx.get_relationships(enname)
        footnotes(
            fromstring(foraw) if foraw else None, forel,
            fromstring(enraw) if enraw else None, enrel)

        if fname is not None:
            embed_relationships = self.docx.get_relationships(fname)[0]
            try:
                raw = self.docx.read(fname)
            except KeyError:
                self.log.warn('Fonts table %s does not exist' % fname)
            else:
                fonts(fromstring(raw), embed_relationships, self.docx,
                      self.dest_dir)

        if tname is not None:
            try:
                raw = self.docx.read(tname)
            except KeyError:
                self.log.warn('Styles %s do not exist' % sname)
            else:
                self.theme(fromstring(raw))

        if sname is not None:
            try:
                raw = self.docx.read(sname)
            except KeyError:
                self.log.warn('Styles %s do not exist' % sname)
            else:
                self.styles(fromstring(raw), fonts, self.theme)

        if nname is not None:
            try:
                raw = self.docx.read(nname)
            except KeyError:
                self.log.warn('Numbering styles %s do not exist' % nname)
            else:
                numbering(fromstring(raw), self.styles,
                          self.docx.get_relationships(nname)[0])

        self.styles.resolve_numbering(numbering)

    def write(self, doc):
        toc = create_toc(doc, self.body, self.resolved_link_map, self.styles,
                         self.object_map, self.log)
        raw = html.tostring(self.html,
                            encoding='utf-8',
                            doctype='<!DOCTYPE html>')
        with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
            f.write(raw)
        css = self.styles.generate_css(self.dest_dir, self.docx)
        if css:
            with open(os.path.join(self.dest_dir, 'docx.css'), 'wb') as f:
                f.write(css.encode('utf-8'))

        opf = OPFCreator(self.dest_dir, self.mi)
        opf.toc = toc
        opf.create_manifest_from_files_in([self.dest_dir])
        opf.create_spine(['index.html'])
        if self.cover_image is not None:
            opf.guide.set_cover(self.cover_image)
        with open(os.path.join(self.dest_dir, 'metadata.opf'),
                  'wb') as of, open(os.path.join(self.dest_dir, 'toc.ncx'),
                                    'wb') as ncx:
            opf.render(of, ncx, 'toc.ncx')
        return os.path.join(self.dest_dir, 'metadata.opf')

    def read_block_anchors(self, doc):
        doc_anchors = frozenset(
            XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
        if doc_anchors:
            current_bm = set()
            rmap = {v: k for k, v in self.object_map.iteritems()}
            for p in descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
                if p.tag.endswith('}p'):
                    if current_bm and p in rmap:
                        para = rmap[p]
                        if 'id' not in para.attrib:
                            para.set(
                                'id',
                                generate_anchor(
                                    next(iter(current_bm)),
                                    frozenset(self.anchor_map.itervalues())))
                        for name in current_bm:
                            self.anchor_map[name] = para.get('id')
                        current_bm = set()
                elif p in doc_anchors:
                    anchor = get(p, 'w:name')
                    if anchor:
                        current_bm.add(anchor)

    def convert_p(self, p):
        dest = P()
        self.object_map[dest] = p
        style = self.styles.resolve_paragraph(p)
        self.layers[p] = []
        self.add_frame(dest, style.frame)

        current_anchor = None
        current_hyperlink = None
        hl_xpath = XPath('ancestor::w:hyperlink[1]')

        for x in descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'):
            if x.tag.endswith('}r'):
                span = self.convert_run(x)
                if current_anchor is not None:
                    (dest if len(dest) == 0 else span).set(
                        'id', current_anchor)
                    current_anchor = None
                if current_hyperlink is not None:
                    try:
                        hl = hl_xpath(x)[0]
                        self.link_map[hl].append(span)
                        self.link_source_map[hl] = self.current_rels
                        x.set('is-link', '1')
                    except IndexError:
                        current_hyperlink = None
                dest.append(span)
                self.layers[p].append(x)
            elif x.tag.endswith('}bookmarkStart'):
                anchor = get(x, 'w:name')
                if anchor and anchor not in self.anchor_map:
                    old_anchor = current_anchor
                    self.anchor_map[anchor] = current_anchor = generate_anchor(
                        anchor, frozenset(self.anchor_map.itervalues()))
                    if old_anchor is not None:
                        # The previous anchor was not applied to any element
                        for a, t in tuple(self.anchor_map.iteritems()):
                            if t == old_anchor:
                                self.anchor_map[a] = current_anchor
            elif x.tag.endswith('}hyperlink'):
                current_hyperlink = x

        m = re.match(r'heading\s+(\d+)$', style.style_name or '',
                     re.IGNORECASE)
        if m is not None:
            n = min(6, max(1, int(m.group(1))))
            dest.tag = 'h%d' % n

        if style.direction == 'rtl':
            dest.set('dir', 'rtl')

        border_runs = []
        common_borders = []
        for span in dest:
            run = self.object_map[span]
            style = self.styles.resolve_run(run)
            if not border_runs or border_runs[-1][1].same_border(style):
                border_runs.append((span, style))
            elif border_runs:
                if len(border_runs) > 1:
                    common_borders.append(border_runs)
                border_runs = []

        for border_run in common_borders:
            spans = []
            bs = {}
            for span, style in border_run:
                style.get_border_css(bs)
                style.clear_border_css()
                spans.append(span)
            if bs:
                cls = self.styles.register(bs, 'text_border')
                wrapper = self.wrap_elems(spans, SPAN())
                wrapper.set('class', cls)

        if not dest.text and len(dest) == 0:
            # Empty paragraph add a non-breaking space so that it is rendered
            # by WebKit
            dest.text = NBSP

        # If the last element in a block is a <br> the <br> is not rendered in
        # HTML, unless it is followed by a trailing space. Word, on the other
        # hand inserts a blank line for trailing <br>s.
        if len(dest) > 0 and not dest[-1].tail:
            if dest[-1].tag == 'br':
                dest[-1].tail = NBSP
            elif len(dest[-1]) > 0 and dest[-1][
                    -1].tag == 'br' and not dest[-1][-1].tail:
                dest[-1][-1].tail = NBSP

        return dest

    def wrap_elems(self, elems, wrapper):
        p = elems[0].getparent()
        idx = p.index(elems[0])
        p.insert(idx, wrapper)
        wrapper.tail = elems[-1].tail
        elems[-1].tail = None
        for elem in elems:
            p.remove(elem)
            wrapper.append(elem)
        return wrapper

    def resolve_links(self):
        self.resolved_link_map = {}
        for hyperlink, spans in self.link_map.iteritems():
            relationships_by_id = self.link_source_map[hyperlink]
            span = spans[0]
            if len(spans) > 1:
                span = self.wrap_elems(spans, SPAN())
            span.tag = 'a'
            self.resolved_link_map[hyperlink] = span
            tgt = get(hyperlink, 'w:tgtFrame')
            if tgt:
                span.set('target', tgt)
            tt = get(hyperlink, 'w:tooltip')
            if tt:
                span.set('title', tt)
            rid = get(hyperlink, 'r:id')
            if rid and rid in relationships_by_id:
                span.set('href', relationships_by_id[rid])
                continue
            anchor = get(hyperlink, 'w:anchor')
            if anchor and anchor in self.anchor_map:
                span.set('href', '#' + self.anchor_map[anchor])
                continue
            self.log.warn(
                'Hyperlink with unknown target (rid=%s, anchor=%s), ignoring' %
                (rid, anchor))
            # hrefs that point nowhere give epubcheck a hernia. The element
            # should be styled explicitly by Word anyway.
            # span.set('href', '#')
        rmap = {v: k for k, v in self.object_map.iteritems()}
        for hyperlink, runs in self.fields.hyperlink_fields:
            spans = [rmap[r] for r in runs if r in rmap]
            if not spans:
                continue
            if len(spans) > 1:
                span = self.wrap_elems(spans, SPAN())
            span.tag = 'a'
            tgt = hyperlink.get('target', None)
            if tgt:
                span.set('target', tgt)
            tt = hyperlink.get('title', None)
            if tt:
                span.set('title', tt)
            url = hyperlink.get('url', None)
            if url is None:
                anchor = hyperlink.get('anchor', None)
                if anchor in self.anchor_map:
                    span.set('href', '#' + self.anchor_map[anchor])
                    continue
                self.log.warn('Hyperlink field with unknown anchor: %s' %
                              anchor)
            else:
                if url in self.anchor_map:
                    span.set('href', '#' + self.anchor_map[url])
                    continue
                span.set('href', url)

        for img, link in self.images.links:
            parent = img.getparent()
            idx = parent.index(img)
            a = A(img)
            a.tail, img.tail = img.tail, None
            parent.insert(idx, a)
            tgt = link.get('target', None)
            if tgt:
                a.set('target', tgt)
            tt = link.get('title', None)
            if tt:
                a.set('title', tt)
            rid = link['id']
            if rid in relationships_by_id:
                dest = relationships_by_id[rid]
                if dest.startswith('#'):
                    if dest[1:] in self.anchor_map:
                        a.set('href', '#' + self.anchor_map[dest[1:]])
                else:
                    a.set('href', dest)

    def convert_run(self, run):
        ans = SPAN()
        self.object_map[ans] = run
        text = Text(ans, 'text', [])

        for child in run:
            if is_tag(child, 'w:t'):
                if not child.text:
                    continue
                space = child.get(XML('space'), None)
                preserve = False
                if space == 'preserve':
                    # Only use a <span> with white-space:pre-wrap if this element
                    # actually needs it, i.e. if it has more than one
                    # consecutive space or it has newlines or tabs.
                    multi_spaces = self.ms_pat.search(child.text) is not None
                    preserve = multi_spaces or self.ws_pat.search(
                        child.text) is not None
                if preserve:
                    text.add_elem(
                        SPAN(child.text, style="white-space:pre-wrap"))
                    ans.append(text.elem)
                else:
                    text.buf.append(child.text)
            elif is_tag(child, 'w:cr'):
                text.add_elem(BR())
                ans.append(text.elem)
            elif is_tag(child, 'w:br'):
                typ = get(child, 'w:type')
                if typ in {'column', 'page'}:
                    br = BR(style='page-break-after:always')
                else:
                    clear = child.get('clear', None)
                    if clear in {'all', 'left', 'right'}:
                        br = BR(style='clear:%s' %
                                ('both' if clear == 'all' else clear))
                    else:
                        br = BR()
                text.add_elem(br)
                ans.append(text.elem)
            elif is_tag(child, 'w:drawing') or is_tag(child, 'w:pict'):
                for img in self.images.to_html(child, self.current_page,
                                               self.docx, self.dest_dir):
                    text.add_elem(img)
                    ans.append(text.elem)
            elif is_tag(child, 'w:footnoteReference') or is_tag(
                    child, 'w:endnoteReference'):
                anchor, name = self.footnotes.get_ref(child)
                if anchor and name:
                    l = SUP(A(name, href='#' + anchor, title=name),
                            id='back_%s' % anchor)
                    l.set('class', 'noteref')
                    text.add_elem(l)
                    ans.append(text.elem)
            elif is_tag(child, 'w:tab'):
                spaces = int(
                    math.ceil((self.settings.default_tab_stop / 36) * 6))
                text.add_elem(SPAN(NBSP * spaces))
                ans.append(text.elem)
                ans[-1].set('class', 'tab')
        if text.buf:
            setattr(text.elem, text.attr, ''.join(text.buf))

        style = self.styles.resolve_run(run)
        if style.vert_align in {'superscript', 'subscript'}:
            ans.tag = 'sub' if style.vert_align == 'subscript' else 'sup'
        if style.lang is not inherit:
            ans.lang = style.lang
        return ans

    def add_frame(self, html_obj, style):
        last_run = self.framed[-1]
        if style is inherit:
            if last_run:
                self.framed.append([])
            return

        if last_run:
            if last_run[-1][1] == style:
                last_run.append((html_obj, style))
            else:
                self.framed.append((html_obj, style))
        else:
            last_run.append((html_obj, style))

    def apply_frames(self):
        for run in filter(None, self.framed):
            style = run[0][1]
            paras = tuple(x[0] for x in run)
            parent = paras[0].getparent()
            idx = parent.index(paras[0])
            frame = DIV(*paras)
            parent.insert(idx, frame)
            self.framed_map[frame] = css = style.css(
                self.page_map[self.object_map[paras[0]]])
            self.styles.register(css, 'frame')