示例#1
0
    def drawing_to_html(self, drawing, page):
        """Yield an image element for every picture found inside *drawing*.

        Inline pictures (``wp:inline``) are yielded first, followed by the
        floating ones (``wp:anchor``). Pictures for which ``pic_to_img``
        returns None are skipped.

        :param drawing: element whose ``wp:inline``/``wp:anchor`` children
            are scanned.
        :param page: passed through to ``get_float_properties`` for floats.
        """
        # Inline pictures are handled before the floats; the only difference
        # between the two is the extra float positioning step.
        for expr, is_float in (('./wp:inline', False), ('./wp:anchor', True)):
            for container in XPath(expr)(drawing):
                style, alt = get_image_properties(container)
                if is_float:
                    # The return value is ignored, so this presumably updates
                    # the style dict in place -- confirm against the helper.
                    self.get_float_properties(container, style, page)
                for pic in XPath('descendant::pic:pic')(container):
                    ans = self.pic_to_img(pic, alt, container)
                    if ans is None:
                        continue
                    if style:
                        # .items() works on both Python 2 and Python 3
                        ans.set('style', '; '.join(
                            '%s: %s' % (k, v) for k, v in style.items()))
                    yield ans
示例#2
0
def get_applicable_xe_fields(index, xe_fields):
    """Return the XE (index entry) fields that belong to the given index.

    Filtering is applied in three successive steps:

    1. Only entries whose ``entry-type`` equals the index's ``entry-type``
       (both may be absent) are kept.
    2. If the index has a ``letter-range`` of the form ``start-end`` with
       both parts non-empty, only entries whose text starts with a character
       in that inclusive range are kept. Entries with empty text are dropped.
    3. If the index names a ``bookmark``, only entries whose ``start_elem``
       has a matching ``w:bookmarkStart`` ancestor are kept.

    :param index: mapping describing the index field.
    :param xe_fields: iterable of mappings describing the XE fields.
    :return: list of the applicable XE field mappings.
    """
    iet = index.get('entry-type', None)
    xe_fields = [xe for xe in xe_fields if xe.get('entry-type', None) == iet]

    lr = index.get('letter-range', None)
    if lr is not None:
        sl, el = (part.strip() for part in lr.partition('-')[0::2])
        if sl and el:

            def inrange(text):
                # Slice rather than index so that empty text simply fails the
                # comparison instead of raising IndexError.
                return sl <= (text or '')[:1] <= el

            xe_fields = [xe for xe in xe_fields if inrange(xe.get('text', ''))]

    bmark = index.get('bookmark', None)
    if bmark is None or not xe_fields:
        # Nothing left to check against a bookmark (and no element available
        # from which to search the document).
        return xe_fields
    attr = expand('w:name')
    bookmarks = {
        b
        for b in XPath('//w:bookmarkStart')(xe_fields[0]['start_elem'])
        if b.get(attr, None) == bmark
    }
    ancestors = XPath('ancestor::w:bookmarkStart')

    def contained(xe):
        # Check if the xe field is contained inside a bookmark with the
        # specified name
        return bool(set(ancestors(xe['start_elem'])) & bookmarks)

    return [xe for xe in xe_fields if contained(xe)]
示例#3
0
def get_image_properties(parent):
    """Collect CSS properties and alternate text for an image container.

    The ``wp:extent`` children of *parent* supply the size (``cx``/``cy``
    converted with ``emu_to_pt``) and the ``wp:docPr`` children supply the
    description and the hidden flag.

    :return: a ``(style, alt)`` tuple. *style* is a dict that may contain the
        keys ``width``, ``height`` and ``display``; *alt* is the last
        non-empty ``descr`` value found, or None.
    """
    dims = {'width': None, 'height': None}
    for extent in XPath('./wp:extent')(parent):
        for prop, attr in (('width', 'cx'), ('height', 'cy')):
            try:
                dims[prop] = emu_to_pt(int(extent.get(attr)))
            except (TypeError, ValueError):
                # Missing or malformed value: keep whatever we already have
                continue

    style = {}
    for prop in ('width', 'height'):
        if dims[prop] is not None:
            style[prop] = '%.3gpt' % dims[prop]

    alt = None
    for doc_pr in XPath('./wp:docPr')(parent):
        alt = doc_pr.get('descr', None) or alt
        if doc_pr.get('hidden', None) in {'true', 'on', '1'}:
            style['display'] = 'none'

    return style, alt
示例#4
0
    def pic_to_img(self, pic, alt, parent):
        """Create an image element for the picture *pic*.

        :param pic: the picture element to convert.
        :param alt: fallback alternate text, used when the picture's own
            ``pic:cNvPr`` element has no ``descr`` attribute.
        :param parent: element searched for an ``a:hlinkClick`` hyperlink; if
            several are present the last one is used.
        :return: the image element for the first ``a:blip`` whose relationship
            id is present in ``self.rid_map`` and for which a filename could
            be generated, or None if there is no such blip.
        """
        name = None
        link = None
        for hl in XPath('descendant::a:hlinkClick[@r:id]')(parent):
            link = {'id': get(hl, 'r:id')}
            tgt = hl.get('tgtFrame', None)
            if tgt:
                link['target'] = tgt
            title = hl.get('tooltip', None)
            if title:
                link['title'] = title

        for pr in XPath('descendant::pic:cNvPr')(pic):
            name = pr.get('name', None)
            if name:
                name = ascii_filename(name).replace(' ', '_')
            # Prefer the picture's own description, but do not throw away the
            # alt text supplied by the caller when there is none.
            alt = pr.get('descr', None) or alt
            for a in XPath('descendant::a:blip[@r:embed or @r:link]')(pic):
                # Embedded images take precedence over linked ones
                rid = get(a, 'r:embed') or get(a, 'r:link')
                if rid and rid in self.rid_map:
                    try:
                        src = self.generate_filename(rid, name)
                    except LinkedImageNotFound as err:
                        self.log.warn('Linked image: %s not found, ignoring' % err.fname)
                        continue
                    img = IMG(src='images/%s' % src)
                    img.set('alt', alt or 'Image')
                    if link is not None:
                        # Recorded for later; how self.links is consumed is not
                        # visible here.
                        self.links.append((img, link))
                    return img
        return None
示例#5
0
 def create_instance(n, definition):
     """Create a numbering instance from *definition*, applying overrides.

     A copy of *definition* is made and the ``w:lvlOverride`` children of
     *n* are applied to it: a nested ``w:lvl`` re-reads the level from XML
     and a ``w:startOverride`` replaces the level's start value.

     :param n: element containing the ``w:lvlOverride`` children.
     :param definition: object providing ``copy()`` and a ``levels`` mapping.
     :return: the overridden copy of *definition*.
     """
     nd = definition.copy()
     start_overrides = {}
     for lo in XPath('./w:lvlOverride')(n):
         try:
             ilvl = int(get(lo, 'w:ilvl'))
         except (ValueError, TypeError):
             ilvl = None
         for so in XPath('./w:startOverride[@w:val]')(lo):
             try:
                 start_override = int(get(so, 'w:val'))
             except (TypeError, ValueError):
                 pass
             else:
                 start_overrides[ilvl] = start_override
         for lvl in XPath('./w:lvl')(lo)[:1]:
             nilvl = get(lvl, 'w:ilvl')
             ilvl = nilvl if ilvl is None else ilvl
             alvl = nd.levels.get(ilvl, None)
             if alvl is None:
                 # NOTE(review): this new Level is never stored in nd.levels,
                 # so an override for a previously unknown level appears to
                 # be discarded -- confirm whether that is intended.
                 alvl = Level()
             alvl.read_from_xml(lvl, override=True)
     # Apply each level's own start value. Previously the value of the last
     # parsed override was assigned to every overridden level.
     for ilvl, start in start_overrides.items():
         try:
             nd.levels[ilvl].start = start
         except KeyError:
             # Override refers to a level that the definition does not have
             pass
     return nd
示例#6
0
def get_hpos(anchor, page_width):
    """Return the horizontal position of *anchor* as a fraction of the page.

    The ``wp:positionH`` children are consulted first: a position relative
    to the left/right margin gives 0/1, an explicit ``wp:align`` of left,
    center or right gives 0, 0.5 or 1, and a ``wp:posOffset`` gives the
    offset (converted with ``emu_to_pt``) divided by *page_width*. Failing
    that, the ``x`` attribute of ``wp:simplePos`` is used in the same way.

    :param anchor: the anchor element to inspect.
    :param page_width: width used as the divisor for offsets.
    :return: the position, or 0 if none could be determined.
    """
    alignments = {'left': 0, 'center': 0.5, 'right': 1}
    for ph in XPath('./wp:positionH')(anchor):
        rp = ph.get('relativeFrom', None)
        if rp == 'leftMargin':
            return 0
        if rp == 'rightMargin':
            return 1
        for align in XPath('./wp:align')(ph):
            al = align.text
            if al in alignments:
                return alignments[al]
        for po in XPath('./wp:posOffset')(ph):
            try:
                pos = emu_to_pt(int(po.text))
            except (TypeError, ValueError):
                continue
            return pos/page_width

    for sp in XPath('./wp:simplePos')(anchor):
        try:
            # The attribute is text and must be converted to a number first,
            # exactly as is done for posOffset above.
            x = emu_to_pt(int(sp.get('x', None)))
        except (TypeError, ValueError):
            continue
        return x/page_width

    return 0
示例#7
0
文件: images.py 项目: sss/calibre
    def pic_to_img(self, pic, alt, parent):
        """Create an image element for the picture *pic*.

        :param pic: the picture element to convert.
        :param alt: fallback alternate text, used when the picture's own
            ``pic:cNvPr`` element has no ``descr`` attribute.
        :param parent: element searched for an ``a:hlinkClick`` hyperlink; if
            several are present the last one is used.
        :return: the image element for the first embedded ``a:blip`` whose
            relationship id is present in ``self.rid_map``, or None.
        """
        name = None
        link = None
        for hl in XPath('descendant::a:hlinkClick[@r:id]')(parent):
            link = {'id': get(hl, 'r:id')}
            tgt = hl.get('tgtFrame', None)
            if tgt:
                link['target'] = tgt
            title = hl.get('tooltip', None)
            if title:
                link['title'] = title

        for pr in XPath('descendant::pic:cNvPr')(pic):
            name = pr.get('name', None)
            if name:
                name = ascii_filename(name).replace(' ', '_')
            # Prefer the picture's own description, but do not throw away the
            # alt text supplied by the caller when there is none.
            alt = pr.get('descr', None) or alt
            for a in XPath('descendant::a:blip[@r:embed]')(pic):
                rid = get(a, 'r:embed')
                if rid in self.rid_map:
                    src = self.generate_filename(rid, name)
                    img = IMG(src='images/%s' % src)
                    img.set('alt', alt or 'Image')
                    if link is not None:
                        # Recorded for later; how self.links is consumed is not
                        # visible here.
                        self.links.append((img, link))
                    return img
        return None
示例#8
0
 def apply_markup(self, rmap, parent=None):
     """Build the HTML table for this table and insert it into the tree.

     :param rmap: mapping from source paragraph elements to the elements
         that should be placed inside the table cells.
     :param parent: element to append the table to. When None, the table is
         inserted just before the element mapped from the first item
         obtained by iterating over this object; if there is no such item
         nothing is done.
     """
     table = TABLE('\n\t\t')
     if parent is not None:
         parent.append(table)
     else:
         try:
             anchor = rmap[next(iter(self))]
         except StopIteration:
             return
         container = anchor.getparent()
         container.insert(container.index(anchor), table)

     for row in XPath('./w:tr')(self.tbl):
         html_row = TR('\n\t\t\t')
         html_row.tail = '\n\t\t'
         table.append(html_row)
         for tc in XPath('./w:tc')(row):
             cell = TD()
             cell.tail = '\n\t\t\t'
             html_row.append(cell)
             for child in XPath('./w:p|./w:tbl')(tc):
                 if not child.tag.endswith('}p'):
                     # A nested table renders itself into this cell
                     self.sub_tables[child].apply_markup(rmap, parent=cell)
                     continue
                 cell.append(rmap[child])
         # The last cell of a row and the last row of the table get a
         # shallower trailing indent.
         if len(html_row) > 0:
             html_row[-1].tail = '\n\t\t'
     if len(table) > 0:
         table[-1].tail = '\n\t'
示例#9
0
 def __call__(self, root):
     """Record the Latin typefaces of the major and minor theme fonts.

     Every ``a:fontScheme`` under *root* is scanned; the ``typeface`` of the
     last matching ``a:latin`` element is stored in
     ``self.major_latin_font`` / ``self.minor_latin_font``.
     """
     targets = (
         ('./a:majorFont', 'major_latin_font'),
         ('./a:minorFont', 'minor_latin_font'),
     )
     for scheme in XPath('//a:fontScheme')(root):
         for font_expr, attr in targets:
             for font in XPath(font_expr)(scheme):
                 for latin in XPath('./a:latin[@typeface]')(font):
                     setattr(self, attr, latin.get('typeface'))
示例#10
0
def read_padding(parent, dest):
    """Read the cell margins of *parent* into ``dest.cell_padding_<edge>``.

    The margins are looked up in ``w:tblCellMar`` when *parent* is a
    ``tblPr`` element and in ``w:tcMar`` otherwise. Edges for which no value
    is found are set to ``inherit``.
    """
    if parent.tag.endswith('}tblPr'):
        container = 'tblCellMar'
    else:
        container = 'tcMar'

    padding = dict.fromkeys(edges, inherit)
    for margins in XPath('./w:%s' % container)(parent):
        for side in edges:
            for node in XPath('./w:%s' % side)(margins):
                padding[side] = _read_width(node)

    for side in edges:
        setattr(dest, 'cell_padding_%s' % side, padding[side])
示例#11
0
def read_padding(parent, dest):
    """Read the cell margins of *parent* into ``dest.cell_padding_<edge>``.

    The margins are looked up in ``w:tblCellMar`` when *parent* is a
    ``tblPr`` element and in ``w:tcMar`` otherwise. Edges for which no value
    is found are set to ``inherit``.
    """
    name = 'tblCellMar' if parent.tag.endswith('}tblPr') else 'tcMar'
    sides = ('left', 'top', 'right', 'bottom')
    # Values are kept in a real dict: assigning through locals() does not
    # update local variables, so the values read were previously lost and
    # every edge ended up as inherit.
    padding = dict.fromkeys(sides, inherit)
    for mar in XPath('./w:%s' % name)(parent):
        for side in sides:
            for edge in XPath('./w:%s' % side)(mar):
                padding[side] = _read_width(edge)
    for side in sides:
        setattr(dest, 'cell_padding_%s' % side, padding[side])
示例#12
0
    def __call__(self, doc, log):
        """Collect all complex fields in *doc* and dispatch them to parsers.

        Fields are delimited by ``w:fldChar`` begin/end markers and may be
        nested. Their instruction text and the paragraphs/runs they contain
        are gathered, then every field with instructions is handed to the
        matching ``parse_<type>`` method. Unknown field types are reported
        through ``log.warn`` once per name.
        """
        # Make sure the bookmark prefix does not clash with an existing id
        used_ids = frozenset(XPath('//*/@w:id')(doc))
        counter = 0
        while self.index_bookmark_prefix in used_ids:
            counter += 1
            self.index_bookmark_prefix = self.index_bookmark_prefix.replace(
                '-', '%d-' % counter)

        open_fields = []  # stack of fields that have begun but not yet ended
        for elem in XPath(
                '//*[name()="w:p" or name()="w:r" or name()="w:instrText" or (name()="w:fldChar" and (@w:fldCharType="begin" or @w:fldCharType="end"))]'
        )(doc):
            tag = elem.tag
            if tag.endswith('}fldChar'):
                if get(elem, 'w:fldCharType') == 'begin':
                    field = Field(elem)
                    open_fields.append(field)
                    self.fields.append(field)
                elif open_fields:
                    # An end marker without a matching begin is ignored
                    open_fields.pop().end = elem
                continue
            if not open_fields:
                continue
            if tag.endswith('}instrText'):
                open_fields[-1].add_instr(elem)
            else:
                open_fields[-1].contents.append(elem)

        field_types = ('hyperlink', 'xe', 'index', 'ref', 'noteref')

        def both_cases(lookup):
            # Field names are matched in all-upper and all-lower case only
            table = {t.upper(): lookup(t) for t in field_types}
            table.update({t: lookup(t) for t in field_types})
            return table

        parsers = both_cases(lambda t: getattr(self, 'parse_' + t))
        field_parsers = both_cases(lambda t: globals()['parse_%s' % t])

        for ftype in field_types:
            setattr(self, '%s_fields' % ftype, [])
        # The TOC and PAGEREF fields are handled separately
        ignored = {'TOC', 'toc', 'PAGEREF', 'pageref'}

        for field in self.fields:
            field.finalize()
            if not field.instructions:
                continue
            handler = parsers.get(field.name, None)
            if handler is not None:
                handler(field, field_parsers[field.name], log)
            elif field.name not in ignored:
                log.warn('Encountered unknown field: %s, ignoring it.' %
                         field.name)
                # Only warn once for each unknown field name
                ignored.add(field.name)
示例#13
0
def read_numbering(parent, dest):
    """Set ``dest.numbering`` from the ``w:numPr`` children of *parent*.

    The value is a ``(num_id, level)`` tuple, where *level* is an int (or
    None if absent/invalid) and *num_id* is the raw ``w:val`` of ``w:numId``
    (or None). If neither is available the value is ``inherit``.
    """
    level = numbering_id = None
    for num_pr in XPath('./w:numPr')(parent):
        for level_node in XPath('./w:ilvl[@w:val]')(num_pr):
            try:
                level = int(get(level_node, 'w:val'))
            except (ValueError, TypeError):
                continue
        for id_node in XPath('./w:numId[@w:val]')(num_pr):
            numbering_id = get(id_node, 'w:val')

    if numbering_id is None and level is None:
        dest.numbering = inherit
    else:
        dest.numbering = (numbering_id, level)
示例#14
0
    def __call__(self, footnotes, footnotes_rels, endnotes, endnotes_rels):
        """Index the footnotes and endnotes by their ``w:id``.

        Each ``w:footnote``/``w:endnote`` with a non-empty id is wrapped in a
        ``Note`` (together with the corresponding relationships object) and
        stored in ``self.footnotes``/``self.endnotes``. Either root may be
        None, in which case it is skipped.
        """
        if footnotes is not None:
            for node in XPath('./w:footnote[@w:id]')(footnotes):
                note_id = get(node, 'w:id')
                if not note_id:
                    continue
                self.footnotes[note_id] = Note(node, footnotes_rels)

        if endnotes is not None:
            for node in XPath('./w:endnote[@w:id]')(endnotes):
                note_id = get(node, 'w:id')
                if not note_id:
                    continue
                self.endnotes[note_id] = Note(node, endnotes_rels)
示例#15
0
 def create_instance(n, definition):
     """Return a copy of *definition* with the level overrides of *n* applied.

     For every ``w:lvlOverride`` child of *n*, the first nested ``w:lvl`` (if
     any) is read into the corresponding level of the copy.
     """
     instance = definition.copy()
     for override in XPath('./w:lvlOverride')(n):
         level_id = get(override, 'w:ilvl')
         for node in XPath('./w:lvl')(override)[:1]:
             own_id = get(node, 'w:ilvl')
             # The id on the override wins over the id on the level itself
             if level_id is None:
                 level_id = own_id
             level = instance.levels.get(level_id, None)
             if level is None:
                 # NOTE(review): this new Level is not stored in
                 # instance.levels -- confirm whether that is intended.
                 level = Level()
             level.read_from_xml(node, override=True)
     return instance
示例#16
0
    def __call__(self, root, styles):
        """Read all numbering style definitions.

        Abstract definitions (``w:abstractNum``) are read first; those that
        only link to a numbering style are resolved lazily through
        ``styles.numbering_style_links`` once the concrete instances
        (``w:num``) have been created. Finally a counter holding the start
        value of every level is created for each instance.

        :param root: element containing the ``w:abstractNum`` and ``w:num``
            children.
        :param styles: object providing ``numbering_style_links``.
        """
        lazy_load = {}
        for an in XPath('./w:abstractNum[@w:abstractNumId]')(root):
            an_id = get(an, 'w:abstractNumId')
            nsl = XPath('./w:numStyleLink[@w:val]')(an)
            if nsl:
                lazy_load[an_id] = get(nsl[0], 'w:val')
            else:
                nd = NumberingDefinition(an)
                self.definitions[an_id] = nd

        def create_instance(n, definition):
            # Copy the definition and apply the first w:lvl of every
            # w:lvlOverride found in n.
            nd = definition.copy()
            for lo in XPath('./w:lvlOverride')(n):
                ilvl = get(lo, 'w:ilvl')
                for lvl in XPath('./w:lvl')(lo)[:1]:
                    nilvl = get(lvl, 'w:ilvl')
                    ilvl = nilvl if ilvl is None else ilvl
                    alvl = nd.levels.get(ilvl, None)
                    if alvl is None:
                        # NOTE(review): this new Level is not stored in
                        # nd.levels -- confirm whether that is intended.
                        alvl = Level()
                    alvl.read_from_xml(lvl, override=True)
            return nd

        # Instances whose definition is not known yet are retried after the
        # lazily loaded definitions have been resolved.
        next_pass = {}
        for n in XPath('./w:num[@w:numId]')(root):
            an_id = None
            num_id = get(n, 'w:numId')
            for an in XPath('./w:abstractNumId[@w:val]')(n):
                an_id = get(an, 'w:val')
            d = self.definitions.get(an_id, None)
            if d is None:
                next_pass[num_id] = (an_id, n)
                continue
            self.instances[num_id] = create_instance(n, d)

        # .items() is used throughout as it works on Python 2 and Python 3
        numbering_links = styles.numbering_style_links
        for an_id, style_link in lazy_load.items():
            num_id = numbering_links[style_link]
            self.definitions[an_id] = self.instances[num_id].copy()

        for num_id, (an_id, n) in next_pass.items():
            d = self.definitions.get(an_id, None)
            if d is not None:
                self.instances[num_id] = create_instance(n, d)

        for num_id, d in self.instances.items():
            self.counters[num_id] = Counter({lvl:d.levels[lvl].start for lvl in d.levels})
示例#17
0
 def pic_to_img(self, pic, alt=None):
     """Create an image element for the picture *pic*.

     :param pic: the picture element to convert.
     :param alt: fallback alternate text, used when the picture's own
         ``pic:cNvPr`` element has no ``descr`` attribute.
     :return: the image element for the first embedded ``a:blip`` whose
         relationship id is present in ``self.rid_map``, or None.
     """
     name = None
     for pr in XPath('descendant::pic:cNvPr')(pic):
         name = pr.get('name', None)
         if name:
             name = ascii_filename(name).replace(' ', '_')
         # Prefer the picture's own description, but do not throw away the
         # alt text supplied by the caller when there is none.
         alt = pr.get('descr', None) or alt
         for a in XPath('descendant::a:blip[@r:embed]')(pic):
             rid = get(a, 'r:embed')
             if rid in self.rid_map:
                 src = self.generate_filename(rid, name)
                 img = IMG(src='images/%s' % src)
                 if alt:
                     # Attributes are set with set(); the element itself is
                     # not callable.
                     img.set('alt', alt)
                 return img
     return None
示例#18
0
    def resolve_run(self, r):
        """Return the fully resolved style for the run *r*.

        The result is cached in ``self.run_cache``. Every property is
        computed by ``self.run_val`` from the run's direct formatting and an
        ordered list of parent styles: the default character style, the
        paragraph's character style, the table style and finally either the
        run's linked character style or the default ``character`` style.
        """
        cached = self.run_cache.get(r, None)
        if cached is not None:
            return cached

        paragraphs = XPath('ancestor::w:p[1]')(r)
        p = paragraphs[0] if paragraphs else None
        ans = self.run_cache[r] = RunStyle()

        # Merge all w:rPr children into a single direct formatting style
        run_props = [RunStyle(rPr) for rPr in XPath('./w:rPr')(r)]
        direct_formatting = run_props[0] if run_props else RunStyle()
        for extra in run_props[1:]:
            direct_formatting.update(extra)

        default_char = self.default_styles.get('character', None)
        # As best as I can understand the spec, table overrides should be
        # applied before paragraph overrides, but word does it
        # this way, see the December 2007 table header in the demo
        # document.
        candidates = (
            self.default_character_style,
            self.para_char_cache.get(p, None),
            self.tables.run_style(p),
        )
        parent_styles = [s for s in candidates if s is not None]

        linked = direct_formatting.linked_style
        if linked is not None:
            ls = getattr(self.get(linked), 'character_style', None)
            if ls is not None:
                parent_styles.append(ls)
        elif default_char is not None and default_char.character_style is not None:
            parent_styles.append(default_char.character_style)

        for attr in ans.all_properties:
            value = self.run_val(parent_styles, direct_formatting, attr)
            setattr(ans, attr, value)

        if ans.font_family is not inherit:
            family = self.theme.resolve_font_family(ans.font_family)
            ans.font_family = self.fonts.family_for(family, ans.b, ans.i)

        return ans
示例#19
0
def read_shd(parent, dest):
    """Set ``dest.background_color`` from the ``w:shd`` fill of *parent*.

    The last non-empty ``w:fill`` value wins; without one the value is
    ``inherit``.
    """
    color = inherit
    for node in XPath('./w:shd[@w:fill]')(parent):
        fill = get(node, 'w:fill')
        if not fill:
            continue
        color = simple_color(fill, auto='transparent')
    dest.background_color = color
示例#20
0
def read_indent(parent, dest):
    """Read ``w:ind`` from *parent* into the margin/indent fields of *dest*.

    Sets ``dest.margin_left``, ``dest.margin_right`` and
    ``dest.text_indent``. For each of them the ``*Chars`` attribute takes
    precedence (giving a value in ``em``) over the plain attribute (giving a
    value in ``pt``). For the text indent a hanging indent, which is
    negated, takes precedence over a first line indent. Values that are not
    present remain ``inherit``.
    """
    def measure(chars, plain):
        # Returns (value, unit); unit is None when neither attribute is set
        if chars is not None:
            return simple_float(chars, 0.01), 'em'
        if plain is not None:
            return simple_float(plain, 0.05), 'pt'
        return None, None

    margin_left = margin_right = text_indent = inherit
    for indent in XPath('./w:ind')(parent):
        left, left_chars = get(indent, 'w:left'), get(indent, 'w:leftChars')
        amount, unit = measure(left_chars, left)
        if amount is not None:
            margin_left = '%.3g%s' % (amount, unit)

        right, right_chars = get(indent, 'w:right'), get(indent, 'w:rightChars')
        amount, unit = measure(right_chars, right)
        if amount is not None:
            margin_right = '%.3g%s' % (amount, unit)

        hanging, hanging_chars = get(indent, 'w:hanging'), get(indent, 'w:hangingChars')
        first, first_chars = get(indent, 'w:firstLine'), get(indent, 'w:firstLineChars')
        if hanging is not None:
            hanging = '-' + hanging
        if hanging_chars is not None:
            hanging_chars = '-' + hanging_chars
        amount, unit = measure(hanging_chars, hanging)
        if unit is None:
            # No hanging indent specified at all, use the first line indent
            amount, unit = measure(first_chars, first)
        if amount is not None:
            text_indent = '%.3g%s' % (amount, unit)

    dest.margin_left = margin_left
    dest.margin_right = margin_right
    dest.text_indent = text_indent
示例#21
0
    def read_page_properties(self, doc):
        """Map every paragraph and table in *doc* to its page properties.

        Populates ``self.page_map`` (element -> ``PageProperties``) and
        ``self.section_starts`` (first element of every section). A section
        ends at a paragraph containing a ``w:sectPr``; whatever follows the
        last such paragraph uses the ``w:sectPr`` of the body. Tables are
        also registered with ``self.tables``.
        """
        pending = []  # elements seen since the last section break
        self.page_map = OrderedDict()
        self.section_starts = []

        for node in descendants(doc, 'w:p', 'w:tbl'):
            if node.tag.endswith('}tbl'):
                self.tables.register(node, self.styles)
                pending.append(node)
                continue
            sect = tuple(descendants(node, 'w:sectPr'))
            pending.append(node)
            if not sect:
                continue
            # This paragraph closes a section
            props = PageProperties(sect)
            for member in pending:
                self.page_map[member] = props
            self.section_starts.append(pending[0])
            pending = []

        if pending:
            self.section_starts.append(pending[0])
            props = PageProperties(XPath('./w:body/w:sectPr')(doc))
            for member in pending:
                self.page_map[member] = props
示例#22
0
def read_default_style_language(raw, mi):
    """Set ``mi.languages`` from the default run language in the styles XML.

    *raw* is parsed with ``fromstring``; the first ``w:lang`` value of the
    document defaults that ``canonicalize_lang`` accepts is used. If there
    is none, *mi* is left untouched.
    """
    root = fromstring(raw)
    expr = '/w:styles/w:docDefaults/w:rPrDefault/w:rPr/w:lang/@w:val'
    for candidate in XPath(expr)(root):
        canonical = canonicalize_lang(candidate)
        if canonical:
            mi.languages = [canonical]
            return
示例#23
0
def read_underline(parent, dest):
    """Set ``dest.text_decoration`` from the ``w:u`` children of *parent*.

    A value of ``none`` is kept as is, any other non-empty value becomes
    ``underline``. Without a value the result is ``inherit``.
    """
    decoration = inherit
    for node in XPath('./w:u[@w:val]')(parent):
        val = get(node, 'w:val')
        if not val:
            continue
        if val == 'none':
            decoration = val
        else:
            decoration = 'underline'
    dest.text_decoration = decoration
示例#24
0
    def pict_to_html(self, pict, page):
        """Yield the HTML elements corresponding to a ``pict`` element.

        A horizontal rule is yielded when *pict* has a single child marked
        with ``o:hr``, honouring its ``o:hrpct`` width and ``o:hralign``
        alignment. After that an image element is yielded for every
        ``v:imagedata`` whose relationship id is present in
        ``self.rid_map``; linked images that cannot be found are logged and
        skipped.

        :param pict: the element to convert.
        :param page: not used by this method.
        """
        # First see if we have an <hr>
        is_hr = len(pict) == 1 and get(pict[0], 'o:hr') in {'t', 'true'}
        if is_hr:
            style = {}
            hr = HR()
            try:
                pct = float(get(pict[0], 'o:hrpct'))
            except (ValueError, TypeError, AttributeError):
                pass
            else:
                if pct > 0:
                    style['width'] = '%.3g%%' % pct
            align = get(pict[0], 'o:hralign', 'center')
            if align in {'left', 'right'}:
                style['margin-left'] = '0' if align == 'left' else 'auto'
                style['margin-right'] = 'auto' if align == 'left' else '0'
            if style:
                # .items() works on both Python 2 and Python 3
                hr.set('style', '; '.join(('%s:%s' % (k, v) for k, v in style.items())))
            yield hr

        for imagedata in XPath('descendant::v:imagedata[@r:id]')(pict):
            rid = get(imagedata, 'r:id')
            if rid in self.rid_map:
                try:
                    src = self.generate_filename(rid)
                except LinkedImageNotFound as err:
                    self.log.warn('Linked image: %s not found, ignoring' % err.fname)
                    continue
                img = IMG(src='images/%s' % src, style="display:block")
                alt = get(imagedata, 'o:title')
                img.set('alt', alt or 'Image')
                yield img
示例#25
0
def read_letter_spacing(parent, dest):
    """Set ``dest.letter_spacing`` from the ``w:spacing`` children of *parent*.

    The last value that ``simple_float`` can convert wins; otherwise the
    result is ``inherit``.
    """
    spacing = inherit
    for node in XPath('./w:spacing[@w:val]')(parent):
        converted = simple_float(get(node, 'w:val'), 0.05)
        if converted is None:
            continue
        spacing = converted
    dest.letter_spacing = spacing
示例#26
0
    def __init__(self, rPr=None):
        """Initialise the style, optionally from a run properties element.

        Without *rPr* every property in ``self.all_properties`` is set to
        ``inherit``. With it, the on/off properties are read with
        ``binary_property``, the remaining ones by the module level
        ``read_<name>`` functions, and ``self.linked_style`` is taken from
        the last ``w:rStyle`` child.
        """
        self.linked_style = None
        if rPr is not None:
            toggles = (
                'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs',
                'imprint', 'rtl', 'shadow', 'smallCaps', 'strike', 'vanish',
                'webHidden',
            )
            for name in toggles:
                setattr(self, name, binary_property(rPr, name))

            readers = (
                'text_border', 'color', 'highlight', 'shd', 'letter_spacing',
                'sz', 'underline', 'vert_align', 'lang', 'font_family',
            )
            for suffix in readers:
                # Each reader stores its result on this object
                reader = globals()['read_%s' % suffix]
                reader(rPr, self)

            for style_node in XPath('./w:rStyle[@w:val]')(rPr):
                self.linked_style = get(style_node, 'w:val')
        else:
            for name in self.all_properties:
                setattr(self, name, inherit)

        self._css = None
示例#27
0
def read_vert_align(parent, dest):
    """Set ``dest.vert_align`` from the ``w:vertAlign`` children of *parent*.

    Only ``baseline``, ``subscript`` and ``superscript`` are accepted; the
    last accepted value wins and the default is ``inherit``.
    """
    allowed = {'baseline', 'subscript', 'superscript'}
    alignment = inherit
    for node in XPath('./w:vertAlign[@w:val]')(parent):
        val = get(node, 'w:val')
        if not val or val not in allowed:
            continue
        alignment = val
    dest.vert_align = alignment
示例#28
0
def read_single_border(parent, edge):
    """Read the border definition for one *edge* of *parent*.

    :param parent: element whose ``w:<edge>`` children are read.
    :param edge: name of the edge, used to build the element name.
    :return: dict pairing the names in ``border_props`` with, in order, the
        padding, width, style and color; values not specified are None.
    """
    border_color = border_style = border_width = border_padding = None
    for elem in XPath('./w:%s' % edge)(parent):
        raw_color = get(elem, 'w:color')
        if raw_color is not None:
            border_color = simple_color(raw_color)

        raw_style = get(elem, 'w:val')
        if raw_style is not None:
            border_style = LINE_STYLES.get(raw_style, 'solid')

        raw_space = get(elem, 'w:space')
        if raw_space is not None:
            try:
                border_padding = float(raw_space)
            except (ValueError, TypeError):
                pass

        raw_size = get(elem, 'w:sz')
        if raw_size is not None:
            # We do not care about art borders (they are only used for page
            # borders)
            try:
                # WebKit needs at least 1pt to render borders
                clamped = max(8, float(raw_size))
                border_width = min(96, clamped) / 8
            except (ValueError, TypeError):
                pass

    if border_style == 'double' and border_width is not None and 0 < border_width < 3:
        # WebKit needs 3pts to render double borders
        border_width = 3
    values = (border_padding, border_width, border_style, border_color)
    return dict(zip(border_props, values))
示例#29
0
def read_text_border(parent, dest):
    """Read the ``w:bdr`` text border of *parent* into *dest*.

    Sets ``dest.border_color``, ``dest.border_style``, ``dest.border_width``
    and ``dest.padding``. Without a ``w:bdr`` element all four are
    ``inherit``; with one, the color, style and width start from defaults
    (auto color, solid, 1) that the element's attributes may override.
    """
    elems = XPath('./w:bdr')(parent)
    if elems:
        color, line_style, width = simple_color('auto'), 'solid', 1
    else:
        color = line_style = width = inherit
    padding = inherit

    for elem in elems:
        raw_color = get(elem, 'w:color')
        if raw_color is not None:
            color = simple_color(raw_color)

        raw_style = get(elem, 'w:val')
        if raw_style is not None:
            line_style = LINE_STYLES.get(raw_style, 'solid')

        raw_space = get(elem, 'w:space')
        if raw_space is not None:
            try:
                padding = float(raw_space)
            except (ValueError, TypeError):
                pass

        raw_size = get(elem, 'w:sz')
        if raw_size is not None:
            # We do not care about art borders (they are only used for page
            # borders)
            try:
                # A border of less than 1pt is not rendered by WebKit
                clamped = max(8, float(raw_size))
                width = min(96, clamped) / 8
            except (ValueError, TypeError):
                pass

    dest.border_color = color
    dest.border_style = line_style
    dest.border_width = width
    dest.padding = padding
示例#30
0
def read_sz(parent, dest):
    """Set ``dest.font_size`` from the ``w:sz`` children of *parent*.

    The last value that ``simple_float`` can convert wins; otherwise the
    result is ``inherit``.
    """
    size = inherit
    for node in XPath('./w:sz[@w:val]')(parent):
        converted = simple_float(get(node, 'w:val'), 0.5)
        if converted is None:
            continue
        size = converted
    dest.font_size = size