def pic_to_img(self, pic, alt, parent):
    '''
    Build an ``IMG`` element for the first usable embedded image in *pic*.

    :param pic: Element searched for ``pic:cNvPr`` and ``a:blip`` descendants.
    :param alt: Fallback alternate text, used when ``pic:cNvPr`` has no
        (or an empty) ``descr`` attribute.
    :param parent: Element searched for ``a:hlinkClick`` descendants.
    :return: The new image element, or ``None`` (implicitly) when no blip
        refers to a relationship id present in ``self.rid_map``.
    '''
    name = None
    link = None
    for hl in XPath('descendant::a:hlinkClick[@r:id]')(parent):
        # When several hyperlinks are present the last one wins
        link = {'id': get(hl, 'r:id')}
        tgt = hl.get('tgtFrame', None)
        if tgt:
            link['target'] = tgt
        title = hl.get('tooltip', None)
        if title:
            link['title'] = title

    for pr in XPath('descendant::pic:cNvPr')(pic):
        name = pr.get('name', None)
        if name:
            name = ascii_filename(name).replace(' ', '_')
        # Prefer the picture's own description, but keep the alt text the
        # caller passed in instead of discarding it when there is none.
        alt = pr.get('descr', None) or alt
        for a in XPath('descendant::a:blip[@r:embed]')(pic):
            rid = get(a, 'r:embed')
            if rid in self.rid_map:
                src = self.generate_filename(rid, name)
                img = IMG(src='images/%s' % src)
                img.set('alt', alt or 'Image')
                if link is not None:
                    # Recorded so the link can be applied to the image later
                    self.links.append((img, link))
                return img
def create_instance(n, definition):
    '''
    Create a numbering instance from *definition* with the overrides in *n*.

    :param n: Element whose ``w:lvlOverride`` children are applied.
    :param definition: Object providing ``copy()``; the copy must expose a
        ``levels`` mapping.
    :return: The copy of *definition* after the overrides were applied.
    '''
    nd = definition.copy()
    start_overrides = {}
    for lo in XPath('./w:lvlOverride')(n):
        try:
            ilvl = int(get(lo, 'w:ilvl'))
        except (ValueError, TypeError):
            ilvl = None
        for so in XPath('./w:startOverride[@w:val]')(lo):
            try:
                start_override = int(get(so, 'w:val'))
            except (TypeError, ValueError):
                pass
            else:
                start_overrides[ilvl] = start_override
        for lvl in XPath('./w:lvl')(lo)[:1]:
            nilvl = get(lvl, 'w:ilvl')
            ilvl = nilvl if ilvl is None else ilvl
            alvl = nd.levels.get(ilvl, None)
            if alvl is None:
                # NOTE(review): a freshly created Level is never stored in
                # nd.levels, so its contents appear to be discarded - confirm
                # whether that is intended.
                alvl = Level()
            alvl.read_from_xml(lvl, override=True)
    # Apply each start override to its own level. Previously the value parsed
    # last was assigned to every overridden level.
    for ilvl, start in start_overrides.iteritems():
        try:
            nd.levels[ilvl].start = start
        except KeyError:
            # No such level in this definition, nothing to override
            pass
    return nd
def read_text_border(parent, dest):
    '''
    Read the ``w:bdr`` children of *parent* and store the result on *dest*
    as ``border_color``, ``border_style``, ``border_width`` and ``padding``.

    Every property is ``inherit`` unless a ``w:bdr`` element is found. When
    the first one carries attributes, color, style and width start from
    ``simple_color('auto')``, ``'solid'`` and ``1``.
    '''
    borders = XPath('./w:bdr')(parent)
    if borders and borders[0].attrib:
        color, line_style, width = simple_color('auto'), 'solid', 1
    else:
        color = line_style = width = inherit
    pad = inherit

    for bdr in borders:
        raw_color = get(bdr, 'w:color')
        if raw_color is not None:
            color = simple_color(raw_color)

        raw_style = get(bdr, 'w:val')
        if raw_style is not None:
            # Unknown line styles are rendered as solid
            line_style = LINE_STYLES.get(raw_style, 'solid')

        raw_pad = get(bdr, 'w:space')
        if raw_pad is not None:
            try:
                pad = float(raw_pad)
            except (ValueError, TypeError):
                pass

        raw_width = get(bdr, 'w:sz')
        if raw_width is not None:
            # we dont care about art borders (they are only used for page borders)
            try:
                # A border of less than 1pt is not rendered by WebKit
                width = min(96, max(8, float(raw_width))) / 8
            except (ValueError, TypeError):
                pass

    dest.border_color = color
    dest.border_style = line_style
    dest.border_width = width
    dest.padding = pad
def read_text_border(parent, dest):
    """
    Read the ``w:bdr`` children of *parent* and store the result on *dest*
    as ``border_color``, ``border_style``, ``border_width`` and ``padding``.

    Every property is ``inherit`` unless a ``w:bdr`` element is found. The
    defaults (auto color, solid, width 1) are only applied when the first
    ``w:bdr`` element actually carries attributes.
    """
    border_color = border_style = border_width = padding = inherit
    elems = XPath("./w:bdr")(parent)
    # An attribute-less <w:bdr/> describes no border, so it must not switch
    # on the defaults.
    if elems and elems[0].attrib:
        border_color = simple_color("auto")
        border_style = "solid"
        border_width = 1
    for elem in elems:
        color = get(elem, "w:color")
        if color is not None:
            border_color = simple_color(color)
        style = get(elem, "w:val")
        if style is not None:
            # Unknown line styles are rendered as solid
            border_style = LINE_STYLES.get(style, "solid")
        space = get(elem, "w:space")
        if space is not None:
            try:
                padding = float(space)
            except (ValueError, TypeError):
                pass
        sz = get(elem, "w:sz")
        if sz is not None:
            # we dont care about art borders (they are only used for page borders)
            try:
                # A border of less than 1pt is not rendered by WebKit
                border_width = min(96, max(8, float(sz))) / 8
            except (ValueError, TypeError):
                pass

    setattr(dest, "border_color", border_color)
    setattr(dest, "border_style", border_style)
    setattr(dest, "border_width", border_width)
    setattr(dest, "padding", padding)
def read_text_border(parent, dest):
    '''
    Read the ``w:bdr`` children of *parent* and store the result on *dest*
    as ``border_color``, ``border_style``, ``border_width`` and ``padding``.

    Every property is ``inherit`` unless a ``w:bdr`` element is found. The
    defaults (auto color, solid, width 1) are only applied when the first
    ``w:bdr`` element actually carries attributes.
    '''
    border_color = border_style = border_width = padding = inherit
    elems = XPath('./w:bdr')(parent)
    # An attribute-less <w:bdr/> describes no border, so it must not switch
    # on the defaults.
    if elems and elems[0].attrib:
        border_color = simple_color('auto')
        border_style = 'solid'
        border_width = 1
    for elem in elems:
        color = get(elem, 'w:color')
        if color is not None:
            border_color = simple_color(color)
        style = get(elem, 'w:val')
        if style is not None:
            # Unknown line styles are rendered as solid
            border_style = LINE_STYLES.get(style, 'solid')
        space = get(elem, 'w:space')
        if space is not None:
            try:
                padding = float(space)
            except (ValueError, TypeError):
                pass
        sz = get(elem, 'w:sz')
        if sz is not None:
            # we dont care about art borders (they are only used for page borders)
            try:
                # A border of less than 1pt is not rendered by WebKit
                border_width = min(96, max(8, float(sz))) / 8
            except (ValueError, TypeError):
                pass

    setattr(dest, 'border_color', border_color)
    setattr(dest, 'border_style', border_style)
    setattr(dest, 'border_width', border_width)
    setattr(dest, 'padding', padding)
def pic_to_img(self, pic, alt, parent):
    '''
    Build an ``IMG`` element for the first usable image referenced by *pic*.

    Both embedded (``r:embed``) and linked (``r:link``) blips are considered.
    A blip whose linked image cannot be found is logged and skipped.

    :param pic: Element searched for ``pic:cNvPr`` and ``a:blip`` descendants.
    :param alt: Fallback alternate text, used when ``pic:cNvPr`` has no
        (or an empty) ``descr`` attribute.
    :param parent: Element searched for ``a:hlinkClick`` descendants.
    :return: The new image element, or ``None`` (implicitly) when no usable
        image was found.
    '''
    name = None
    link = None
    for hl in XPath('descendant::a:hlinkClick[@r:id]')(parent):
        # When several hyperlinks are present the last one wins
        link = {'id': get(hl, 'r:id')}
        tgt = hl.get('tgtFrame', None)
        if tgt:
            link['target'] = tgt
        title = hl.get('tooltip', None)
        if title:
            link['title'] = title

    for pr in XPath('descendant::pic:cNvPr')(pic):
        name = pr.get('name', None)
        if name:
            name = ascii_filename(name).replace(' ', '_')
        # Prefer the picture's own description, but keep the alt text the
        # caller passed in instead of discarding it when there is none.
        alt = pr.get('descr', None) or alt
        for a in XPath('descendant::a:blip[@r:embed or @r:link]')(pic):
            rid = get(a, 'r:embed')
            if not rid:
                rid = get(a, 'r:link')
            if rid and rid in self.rid_map:
                try:
                    src = self.generate_filename(rid, name)
                except LinkedImageNotFound as err:
                    self.log.warn('Linked image: %s not found, ignoring' % err.fname)
                    continue
                img = IMG(src='images/%s' % src)
                img.set('alt', alt or 'Image')
                if link is not None:
                    # Recorded so the link can be applied to the image later
                    self.links.append((img, link))
                return img
def pict_to_html(self, pict, page):
    '''
    Generate the HTML elements for a *pict* element.

    Yields an ``HR`` when *pict* has exactly one child whose ``o:hr`` is
    ``t`` or ``true``, then one ``IMG`` for every ``v:imagedata`` descendant
    whose relationship id is present in ``self.rid_map``. Linked images
    that cannot be found are logged and skipped.
    '''
    # First see if we have an <hr>
    if len(pict) == 1 and get(pict[0], 'o:hr') in {'t', 'true'}:
        rule = pict[0]
        css = {}
        hr = HR()
        try:
            pct = float(get(rule, 'o:hrpct'))
        except (ValueError, TypeError, AttributeError):
            pct = None
        if pct is not None and pct > 0:
            css['width'] = '%.3g%%' % pct
        align = get(rule, 'o:hralign', 'center')
        if align == 'left':
            css['margin-left'] = '0'
            css['margin-right'] = 'auto'
        elif align == 'right':
            css['margin-left'] = 'auto'
            css['margin-right'] = '0'
        if css:
            declarations = ('%s:%s' % (k, v) for k, v in css.iteritems())
            hr.set('style', '; '.join(declarations))
        yield hr

    for imagedata in XPath('descendant::v:imagedata[@r:id]')(pict):
        rid = get(imagedata, 'r:id')
        if rid not in self.rid_map:
            continue
        try:
            src = self.generate_filename(rid)
        except LinkedImageNotFound as err:
            self.log.warn('Linked image: %s not found, ignoring' % err.fname)
            continue
        img = IMG(src='images/%s' % src, style="display:block")
        title = get(imagedata, 'o:title')
        img.set('alt', title or 'Image')
        yield img
def read_single_border(parent, edge):
    '''
    Read the border described by the ``w:<edge>`` children of *parent*.

    :param parent: Element whose children are inspected.
    :param edge: Local name of the border element to read.
    :return: dict pairing the names in ``border_props`` with, in this order,
        padding, width, style and color. A value is ``None`` when it was
        not specified.
    '''
    border_color = line_style = border_width = pad = None
    for bdr in XPath('./w:%s' % edge)(parent):
        raw_color = get(bdr, 'w:color')
        if raw_color is not None:
            border_color = simple_color(raw_color)

        raw_style = get(bdr, 'w:val')
        if raw_style is not None:
            # Unknown line styles are rendered as solid
            line_style = LINE_STYLES.get(raw_style, 'solid')

        raw_pad = get(bdr, 'w:space')
        if raw_pad is not None:
            try:
                pad = float(raw_pad)
            except (ValueError, TypeError):
                pass

        raw_width = get(bdr, 'w:sz')
        if raw_width is not None:
            # we dont care about art borders (they are only used for page borders)
            try:
                # WebKit needs at least 1pt to render borders
                border_width = min(96, max(8, float(raw_width))) / 8
            except (ValueError, TypeError):
                pass

    if line_style == 'double' and border_width is not None and 0 < border_width < 3:
        border_width = 3  # WebKit needs 3pts to render double borders
    return dict(zip(border_props, (pad, border_width, line_style, border_color)))
def convert_run(self, run):
    '''
    Convert a run element into a ``SPAN`` and return it.

    The children of *run* are processed in document order: text, line and
    page breaks, images, footnote/endnote references and field separators.
    The new span is also registered in ``self.object_map``.
    '''
    span = SPAN()
    self.object_map[span] = run
    text = Text(span, 'text', [])

    def emit(elem):
        # Hand elem to the text accumulator, then append the element the
        # accumulator currently points at to the span.
        text.add_elem(elem)
        span.append(text.elem)

    for child in run:
        if is_tag(child, 'w:t'):
            content = child.text
            if not content:
                continue
            needs_pre = False
            if child.get(XML('space'), None) == 'preserve':
                # Only use a <span> with white-space:pre-wrap if this element
                # actually needs it, i.e. if it has more than one
                # consecutive space or it has newlines or tabs.
                needs_pre = (self.ms_pat.search(content) is not None or
                             self.ws_pat.search(content) is not None)
            if needs_pre:
                emit(SPAN(content, style="white-space:pre-wrap"))
            else:
                text.buf.append(content)
        elif is_tag(child, 'w:cr'):
            emit(BR())
        elif is_tag(child, 'w:br'):
            if get(child, 'w:type') in {'column', 'page'}:
                br = BR(style='page-break-after:always')
            else:
                clear = child.get('clear', None)
                if clear in {'all', 'left', 'right'}:
                    br = BR(style='clear:%s'%('both' if clear == 'all' else clear))
                else:
                    br = BR()
            emit(br)
        elif is_tag(child, 'w:drawing') or is_tag(child, 'w:pict'):
            for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
                emit(img)
        elif is_tag(child, 'w:footnoteReference') or is_tag(child, 'w:endnoteReference'):
            anchor, name = self.footnotes.get_ref(child)
            if anchor and name:
                ref = SUP(A(name, href='#' + anchor, title=name), id='back_%s' % anchor)
                ref.set('class', 'noteref')
                emit(ref)
        elif is_tag(child, 'w:fldChar') and get(child, 'w:fldCharType') == 'separate':
            text.buf.append('\xa0')

    # Flush whatever text is still buffered
    if text.buf:
        setattr(text.elem, text.attr, ''.join(text.buf))

    style = self.styles.resolve_run(run)
    tag = {'subscript': 'sub', 'superscript': 'sup'}.get(style.vert_align)
    if tag is not None:
        span.tag = tag
    if style.lang is not inherit:
        span.lang = style.lang
    return span
def read_height(parent, dest):
    '''
    Store the row height from the ``w:trHeight`` children of *parent* in
    ``dest.height`` as a ``(rule, value)`` tuple, or ``inherit`` when no
    element with a recognised ``w:hRule`` exists.
    '''
    height = inherit
    for row_height in XPath('./w:trHeight')(parent):
        rule = get(row_height, 'w:hRule', 'auto')
        if rule not in {'auto', 'atLeast', 'exact'}:
            continue
        height = (rule, get(row_height, 'w:val'))
    dest.height = height
def pict_to_html(self, pict, page):
    '''
    Yield one ``IMG`` for every ``v:imagedata`` descendant of *pict* whose
    relationship id is present in ``self.rid_map``.
    '''
    for imagedata in XPath('descendant::v:imagedata[@r:id]')(pict):
        rid = get(imagedata, 'r:id')
        if rid not in self.rid_map:
            continue
        filename = self.generate_filename(rid)
        img = IMG(src='images/%s' % filename, style="display:block")
        title = get(imagedata, 'o:title')
        # The alt attribute is only written when a title exists
        if title:
            img.set('alt', title)
        yield img
def read_font_family(parent, dest):
    '''
    Store the font family from the ``w:rFonts`` children of *parent* in
    ``dest.font_family``.

    A ``w:asciiTheme`` value takes precedence over ``w:ascii`` and is
    wrapped in ``|`` characters. The result is ``inherit`` when neither is
    present.
    '''
    family = inherit
    for fonts in XPath('./w:rFonts')(parent):
        theme = get(fonts, 'w:asciiTheme')
        face = '|%s|' % theme if theme else get(fonts, 'w:ascii')
        if face:
            family = face
    dest.font_family = family
def read_font_family(parent, dest):
    """
    Store the font family from the ``w:rFonts`` children of *parent* in
    ``dest.font_family``.

    A ``w:asciiTheme`` value takes precedence over ``w:ascii`` and is
    wrapped in ``|`` characters. The result is ``inherit`` when neither is
    present.
    """
    family = inherit
    for fonts in XPath("./w:rFonts")(parent):
        theme = get(fonts, "w:asciiTheme")
        face = "|%s|" % theme if theme else get(fonts, "w:ascii")
        if face:
            family = face
    dest.font_family = family
def create_instance(n, definition):
    '''
    Return a copy of *definition* after feeding the first ``w:lvl`` of each
    ``w:lvlOverride`` child of *n* to the matching level.

    The level is looked up by the ``w:ilvl`` of the override, falling back
    to the ``w:ilvl`` of the ``w:lvl`` element itself.
    '''
    instance = definition.copy()
    for override in XPath('./w:lvlOverride')(n):
        level_key = get(override, 'w:ilvl')
        for lvl in XPath('./w:lvl')(override)[:1]:
            fallback_key = get(lvl, 'w:ilvl')
            if level_key is None:
                level_key = fallback_key
            level = instance.levels.get(level_key, None)
            if level is None:
                level = Level()
            level.read_from_xml(lvl, override=True)
    return instance
def __call__(self, footnotes, footnotes_rels, endnotes, endnotes_rels):
    '''
    Collect the notes found in the *footnotes* and *endnotes* trees.

    Every ``w:footnote``/``w:endnote`` with a non-empty ``w:id`` is wrapped
    in a ``Note``, together with the matching relationships argument, and
    stored under that id in ``self.footnotes``/``self.endnotes``. A tree
    that is ``None`` is skipped.
    '''
    if footnotes is not None:
        for node in XPath('./w:footnote[@w:id]')(footnotes):
            note_id = get(node, 'w:id')
            if not note_id:
                continue
            self.footnotes[note_id] = Note(node, footnotes_rels)

    if endnotes is not None:
        for node in XPath('./w:endnote[@w:id]')(endnotes):
            note_id = get(node, 'w:id')
            if not note_id:
                continue
            self.endnotes[note_id] = Note(node, endnotes_rels)
def __call__(self, footnotes, endnotes):
    '''
    Collect the notes found in the *footnotes* and *endnotes* trees.

    Every ``w:footnote``/``w:endnote`` with a non-empty ``w:id`` is wrapped
    in a ``Note`` and stored under that id in
    ``self.footnotes``/``self.endnotes``. A tree that is ``None`` is skipped.
    '''
    if footnotes is not None:
        for node in XPath('./w:footnote[@w:id]')(footnotes):
            note_id = get(node, 'w:id')
            if not note_id:
                continue
            self.footnotes[note_id] = Note(node)

    if endnotes is not None:
        for node in XPath('./w:endnote[@w:id]')(endnotes):
            note_id = get(node, 'w:id')
            if not note_id:
                continue
            self.endnotes[note_id] = Note(node)
def read_numbering(parent, dest):
    '''
    Store the numbering reference from the ``w:numPr`` children of *parent*
    in ``dest.numbering`` as ``(num_id, level)``, or ``inherit`` when
    neither value was found. The level is an int, the id is kept as read.
    '''
    level = None
    num_id = None
    for num_pr in XPath('./w:numPr')(parent):
        for ilvl in XPath('./w:ilvl[@w:val]')(num_pr):
            try:
                level = int(get(ilvl, 'w:val'))
            except (ValueError, TypeError):
                # Keep the previous level on an unparseable value
                pass
        for num in XPath('./w:numId[@w:val]')(num_pr):
            num_id = get(num, 'w:val')

    if num_id is None and level is None:
        dest.numbering = inherit
    else:
        dest.numbering = (num_id, level)
def get_cover(docx):
    '''
    Return the first image of the document that looks like a cover.

    An image qualifies when its height is between 0.8 and 1.8 times its
    width and it has at least 160000 pixels.

    :param docx: Object providing ``document``, ``document_relationships``
        and ``read()``.
    :return: ``(fmt, raw)`` for the first qualifying image, ``None`` if there
        is none.
    '''
    doc = docx.document
    rid_map = docx.document_relationships[0]
    for image in images(doc):
        rid = get(image, 'r:embed') or get(image, 'r:id')
        if rid not in rid_map:
            continue
        try:
            raw = docx.read(rid_map[rid])
            width, height, fmt = identify_data(raw)
        except Exception:
            # Unreadable or unidentifiable images can never be the cover
            continue
        if not width:
            # A zero width would otherwise raise ZeroDivisionError
            continue
        # Compare by multiplication so the result does not depend on
        # integer versus true division.
        if 0.8 * width <= height <= 1.8 * width and height * width >= 160000:
            return (fmt, raw)
    return None
def read_indent(parent, dest):
    '''
    Read the ``w:ind`` children of *parent* and store ``margin_left``,
    ``margin_right`` and ``text_indent`` on *dest* as CSS length strings,
    or ``inherit`` when not specified.

    ``*Chars`` attributes win over their plain counterparts and produce
    ``em`` units, the plain attributes produce ``pt``. A hanging indent
    wins over a first line indent and is negated.
    '''
    margin_left = margin_right = text_indent = inherit
    for ind in XPath('./w:ind')(parent):
        left, left_chars = get(ind, 'w:left'), get(ind, 'w:leftChars')
        if left_chars is not None:
            amount = simple_float(left_chars, 0.01)
        elif left is not None:
            amount = simple_float(left, 0.05)
        else:
            amount = None
        if amount is not None:
            margin_left = '%.3g%s' % (amount, 'em' if left_chars is not None else 'pt')

        right, right_chars = get(ind, 'w:right'), get(ind, 'w:rightChars')
        if right_chars is not None:
            amount = simple_float(right_chars, 0.01)
        elif right is not None:
            amount = simple_float(right, 0.05)
        else:
            amount = None
        if amount is not None:
            margin_right = '%.3g%s' % (amount, 'em' if right_chars is not None else 'pt')

        hanging, hanging_chars = get(ind, 'w:hanging'), get(ind, 'w:hangingChars')
        first, first_chars = get(ind, 'w:firstLine'), get(ind, 'w:firstLineChars')
        # A hanging indent is a negative text indent
        if hanging is not None:
            hanging = '-' + hanging
        if hanging_chars is not None:
            hanging_chars = '-' + hanging_chars

        if hanging_chars is not None:
            amount, unit = simple_float(hanging_chars, 0.01), 'em'
        elif hanging is not None:
            amount, unit = simple_float(hanging, 0.05), 'pt'
        elif first_chars is not None:
            amount, unit = simple_float(first_chars, 0.01), 'em'
        elif first is not None:
            amount, unit = simple_float(first, 0.05), 'pt'
        else:
            amount, unit = None, None
        if amount is not None:
            text_indent = '%.3g%s' % (amount, unit)

    dest.margin_left = margin_left
    dest.margin_right = margin_right
    dest.text_indent = text_indent
def read_spacing(parent, dest):
    '''
    Read the ``w:spacing`` children of *parent* and store ``margin_top``,
    ``margin_bottom`` and ``line_height`` on *dest* as CSS strings, or
    ``inherit`` when not specified.

    Before/after spacing is ignored when the matching autospacing flag is
    on. ``*Lines`` attributes win over the plain ones and produce ``ex``
    units, the plain ones ``pt``. The line height is in ``pt`` for the
    ``exact`` and ``atLeast`` rules and unitless otherwise. Values that
    cannot be parsed leave the property unchanged.
    '''
    padding_top = padding_bottom = line_height = inherit
    for s in XPath('./w:spacing')(parent):
        a, al, aa = get(s, 'w:after'), get(s, 'w:afterLines'), get(s, 'w:afterAutospacing')
        pb = (None if aa in {'on', '1', 'true'} else
              simple_float(al, 0.02) if al is not None else
              simple_float(a, 0.05) if a is not None else None)
        if pb is not None:
            padding_bottom = '%.3g%s' % (pb, 'ex' if al is not None else 'pt')

        b, bl, bb = get(s, 'w:before'), get(s, 'w:beforeLines'), get(s, 'w:beforeAutospacing')
        pt = (None if bb in {'on', '1', 'true'} else
              simple_float(bl, 0.02) if bl is not None else
              simple_float(b, 0.05) if b is not None else None)
        if pt is not None:
            padding_top = '%.3g%s' % (pt, 'ex' if bl is not None else 'pt')

        l, lr = get(s, 'w:line'), get(s, 'w:lineRule', 'auto')
        if l is not None:
            fixed = lr in {'exact', 'atLeast'}
            lh = simple_float(l, 0.05) if fixed else simple_float(l, 1 / 240.0)
            # simple_float() results are checked for None everywhere else;
            # without this check an unparseable w:line made '%.3g' raise
            # TypeError.
            if lh is not None:
                line_height = '%.3g%s' % (lh, 'pt' if fixed else '')

    setattr(dest, 'margin_top', padding_top)
    setattr(dest, 'margin_bottom', padding_bottom)
    setattr(dest, 'line_height', line_height)
def __call__(self, root, styles):
    '''
    Read all numbering style definitions

    Fills ``self.definitions`` (keyed by abstract numbering id),
    ``self.instances`` (keyed by numbering id) and ``self.counters`` (one
    ``Counter`` per instance, seeded with the start value of each level).

    :param root: Element holding the ``w:abstractNum`` and ``w:num`` children.
    :param styles: Object providing ``numbering_style_links``, used to
        resolve ``w:numStyleLink`` references.
    '''
    # Abstract definitions that only link to a numbering style cannot be
    # built yet, remember the style they refer to.
    lazy_load = {}
    for an in XPath('./w:abstractNum[@w:abstractNumId]')(root):
        an_id = get(an, 'w:abstractNumId')
        nsl = XPath('./w:numStyleLink[@w:val]')(an)
        if nsl:
            lazy_load[an_id] = get(nsl[0], 'w:val')
        else:
            nd = NumberingDefinition(an)
            self.definitions[an_id] = nd

    def create_instance(n, definition):
        # Copy the definition and feed the first w:lvl of every
        # w:lvlOverride to the matching level
        nd = definition.copy()
        for lo in XPath('./w:lvlOverride')(n):
            ilvl = get(lo, 'w:ilvl')
            for lvl in XPath('./w:lvl')(lo)[:1]:
                nilvl = get(lvl, 'w:ilvl')
                ilvl = nilvl if ilvl is None else ilvl
                alvl = nd.levels.get(ilvl, None)
                if alvl is None:
                    alvl = Level()
                alvl.read_from_xml(lvl, override=True)
        return nd

    # Instances whose definition is not available yet are retried below
    next_pass = {}
    for n in XPath('./w:num[@w:numId]')(root):
        an_id = None
        num_id = get(n, 'w:numId')
        for an in XPath('./w:abstractNumId[@w:val]')(n):
            an_id = get(an, 'w:val')
        d = self.definitions.get(an_id, None)
        if d is None:
            next_pass[num_id] = (an_id, n)
            continue
        self.instances[num_id] = create_instance(n, d)

    # Resolve the linked definitions from the instances created so far
    numbering_links = styles.numbering_style_links
    for an_id, style_link in lazy_load.iteritems():
        num_id = numbering_links[style_link]
        self.definitions[an_id] = self.instances[num_id].copy()

    # Second pass for the instances that depended on a linked definition
    for num_id, (an_id, n) in next_pass.iteritems():
        d = self.definitions.get(an_id, None)
        if d is not None:
            self.instances[num_id] = create_instance(n, d)

    for num_id, d in self.instances.iteritems():
        self.counters[num_id] = Counter({lvl:d.levels[lvl].start for lvl in d.levels})
def read_letter_spacing(parent, dest):
    """
    Store the letter spacing from the ``w:spacing`` children of *parent* in
    ``dest.letter_spacing``: the ``w:val`` attribute passed through
    ``simple_float`` with a factor of 0.05, or ``inherit`` when there is no
    usable value.
    """
    spacing = inherit
    for node in XPath("./w:spacing[@w:val]")(parent):
        parsed = simple_float(get(node, "w:val"), 0.05)
        if parsed is None:
            continue
        spacing = parsed
    dest.letter_spacing = spacing
def __init__(self, rPr=None):
    '''
    Initialise the run style from an ``rPr`` element.

    Without *rPr* every name in ``self.all_properties`` is set to
    ``inherit``. Otherwise the boolean properties are read with
    ``binary_property``, the remaining ones by the module level ``read_*``
    functions, and ``linked_style`` is taken from ``w:rStyle``.
    '''
    self.linked_style = None
    self._css = None
    if rPr is None:
        for prop in self.all_properties:
            setattr(self, prop, inherit)
        return

    toggles = (
        'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs', 'imprint',
        'rtl', 'shadow', 'smallCaps', 'strike', 'vanish', 'webHidden',
    )
    for prop in toggles:
        setattr(self, prop, binary_property(rPr, prop))

    readers = ('text_border', 'color', 'highlight', 'shd', 'letter_spacing',
               'sz', 'underline', 'vert_align', 'lang', 'font_family')
    for suffix in readers:
        # The readers are looked up by name among the module globals
        read = globals()['read_%s' % suffix]
        read(rPr, self)

    for linked in XPath('./w:rStyle[@w:val]')(rPr):
        self.linked_style = get(linked, 'w:val')
def read_sz(parent, dest):
    '''
    Store the font size from the ``w:sz`` children of *parent* in
    ``dest.font_size``: the ``w:val`` attribute passed through
    ``simple_float`` with a factor of 0.5, or ``inherit`` when there is no
    usable value.
    '''
    size = inherit
    for node in XPath('./w:sz[@w:val]')(parent):
        parsed = simple_float(get(node, 'w:val'), 0.5)
        if parsed is None:
            continue
        size = parsed
    dest.font_size = size
def read_font_family(parent, dest):
    '''
    Store the ``w:ascii`` font of the ``w:rFonts`` children of *parent* in
    ``dest.font_family``, or ``inherit`` when there is no non-empty value.
    '''
    family = inherit
    for fonts in XPath('./w:rFonts[@w:ascii]')(parent):
        face = get(fonts, 'w:ascii')
        if not face:
            continue
        family = face
    dest.font_family = family
def read_vert_align(parent, dest):
    '''
    Store the vertical alignment from the ``w:vertAlign`` children of
    *parent* in ``dest.vert_align``. Only ``baseline``, ``subscript`` and
    ``superscript`` are accepted, anything else leaves ``inherit``.
    '''
    alignment = inherit
    for node in XPath('./w:vertAlign[@w:val]')(parent):
        candidate = get(node, 'w:val')
        if not candidate:
            continue
        if candidate in {'baseline', 'subscript', 'superscript'}:
            alignment = candidate
    dest.vert_align = alignment
def read_underline(parent, dest):
    '''
    Store the underline state from the ``w:u`` children of *parent* in
    ``dest.text_decoration``: ``none`` is kept as is, every other non-empty
    value becomes ``underline``. ``inherit`` when nothing is specified.
    '''
    decoration = inherit
    for node in XPath('./w:u[@w:val]')(parent):
        kind = get(node, 'w:val')
        if not kind:
            continue
        if kind == 'none':
            decoration = kind
        else:
            decoration = 'underline'
    dest.text_decoration = decoration
def __init__(self, tblPr=None):
    '''
    Initialise the table style from a ``tblPr`` element.

    Without *tblPr* every name in ``self.all_properties`` is set to
    ``inherit``. Otherwise the properties are read by the module level
    ``read_*`` functions. When the parent of *tblPr* is a ``w:style``
    element, ``self.overrides`` maps every ``w:tblStylePr`` type to a dict
    with the optional keys ``table``, ``row``, ``cell``, ``para`` and ``run``.
    '''
    if tblPr is None:
        for prop in self.all_properties:
            setattr(self, prop, inherit)
        self._css = None
        return

    self.overrides = inherit
    readers = ('width', 'float', 'padding', 'shd', 'justification',
               'spacing', 'indent', 'borders', 'band_size', 'look')
    for suffix in readers:
        # The readers are looked up by name among the module globals
        read = globals()['read_%s' % suffix]
        read(tblPr, self)

    style_elem = tblPr.getparent()
    if is_tag(style_elem, 'w:style'):
        self.overrides = {}
        for conditional in XPath('./w:tblStylePr[@w:type]')(style_elem):
            entry = {}
            self.overrides[get(conditional, 'w:type')] = entry
            # For each kind the last matching element wins
            for table_props in XPath('./w:tblPr')(conditional):
                entry['table'] = TableStyle(table_props)
            for row_props in XPath('./w:trPr')(conditional):
                entry['row'] = RowStyle(row_props)
            for cell_props in XPath('./w:tcPr')(conditional):
                entry['cell'] = CellStyle(cell_props)
            for para_props in XPath('./w:pPr')(conditional):
                entry['para'] = ParagraphStyle(para_props)
            for run_props in XPath('./w:rPr')(conditional):
                entry['run'] = RunStyle(run_props)
    self._css = None
def read_letter_spacing(parent, dest):
    '''
    Store the letter spacing from the ``w:spacing`` children of *parent* in
    ``dest.letter_spacing``: the ``w:val`` attribute passed through
    ``simple_float`` with a factor of 0.05, or ``inherit`` when there is no
    usable value.
    '''
    spacing = inherit
    for node in XPath('./w:spacing[@w:val]')(parent):
        parsed = simple_float(get(node, 'w:val'), 0.05)
        if parsed is None:
            continue
        spacing = parsed
    dest.letter_spacing = spacing
def read_sz(parent, dest):
    """
    Store the font size from the ``w:sz`` children of *parent* in
    ``dest.font_size``: the ``w:val`` attribute passed through
    ``simple_float`` with a factor of 0.5, or ``inherit`` when there is no
    usable value.
    """
    size = inherit
    for node in XPath("./w:sz[@w:val]")(parent):
        parsed = simple_float(get(node, "w:val"), 0.5)
        if parsed is None:
            continue
        size = parsed
    dest.font_size = size
def read_vert_align(parent, dest):
    """
    Store the vertical alignment from the ``w:vertAlign`` children of
    *parent* in ``dest.vert_align``. Only ``baseline``, ``subscript`` and
    ``superscript`` are accepted, anything else leaves ``inherit``.
    """
    alignment = inherit
    for node in XPath("./w:vertAlign[@w:val]")(parent):
        candidate = get(node, "w:val")
        if not candidate:
            continue
        if candidate in {"baseline", "subscript", "superscript"}:
            alignment = candidate
    dest.vert_align = alignment
def _read_width(elem):
    '''
    Convert the ``w:w``/``w:type`` attributes of *elem* to a CSS width.

    :return: ``'0'`` for type ``nil``, ``'auto'`` for ``auto`` (also the
        default when ``w:type`` is missing), a ``pt`` length (``w:w`` / 20)
        for ``dxa``, a percentage (``w:w`` / 50) for ``pct`` and ``inherit``
        for any other type. A missing or unparseable ``w:w`` counts as 0.
    '''
    try:
        w = int(get(elem, 'w:w'))
    except (TypeError, ValueError):
        w = 0
    typ = get(elem, 'w:type', 'auto')
    if typ == 'nil':
        return '0'
    if typ == 'auto':
        return 'auto'
    # Divide by floats so that fractional widths survive even when true
    # division is not enabled for this module.
    if typ == 'dxa':
        return '%.3gpt' % (w / 20.0)
    if typ == 'pct':
        return '%.3g%%' % (w / 50.0)
    return inherit
def __init__(self, tbl, styles, para_map, is_sub_table=False):
    '''
    Resolve the styles of a table and of its rows, cells and paragraphs.

    :param tbl: The table element.
    :param styles: Style lookup, queried with the id from ``w:tblStyle``.
    :param para_map: Mapping that is updated so that every paragraph found
        directly inside a cell of this table points to this object.
    :param is_sub_table: Stored as ``self.is_sub_table``; set to True for
        the tables created for ``self.sub_tables``.
    '''
    self.tbl = tbl
    self.styles = styles
    self.is_sub_table = is_sub_table

    # Read Table Style
    style = {'table': TableStyle()}
    for tblPr in XPath('./w:tblPr')(tbl):
        for ts in XPath('./w:tblStyle[@w:val]')(tblPr):
            style_id = get(ts, 'w:val')
            s = styles.get(style_id)
            if s is not None:
                if s.table_style is not None:
                    style['table'].update(s.table_style)
                if s.paragraph_style is not None:
                    if 'paragraph' in style:
                        style['paragraph'].update(s.paragraph_style)
                    else:
                        style['paragraph'] = s.paragraph_style
                if s.character_style is not None:
                    if 'run' in style:
                        style['run'].update(s.character_style)
                    else:
                        style['run'] = s.character_style
        # Properties set directly on the table are applied after the
        # referenced style
        style['table'].update(TableStyle(tblPr))
    self.table_style, self.paragraph_style = style['table'], style.get(
        'paragraph', None)
    self.run_style = style.get('run', None)
    self.overrides = self.table_style.overrides
    if self.overrides is inherit:
        self.overrides = {}
    if 'wholeTable' in self.overrides and 'table' in self.overrides[
            'wholeTable']:
        self.table_style.update(self.overrides['wholeTable']['table'])

    self.style_map = {}
    self.paragraphs = []
    # One list of cell elements per row
    self.cell_map = []

    rows = XPath('./w:tr')(tbl)
    for r, tr in enumerate(rows):
        overrides = self.get_overrides(r, None, len(rows), None)
        self.resolve_row_style(tr, overrides)
        cells = XPath('./w:tc')(tr)
        self.cell_map.append([])
        for c, tc in enumerate(cells):
            overrides = self.get_overrides(r, c, len(rows), len(cells))
            self.resolve_cell_style(tc, overrides, r, c, len(rows), len(cells))
            self.cell_map[-1].append(tc)
            for p in XPath('./w:p')(tc):
                para_map[p] = self
                self.paragraphs.append(p)
                self.resolve_para_style(p, overrides)

    self.handle_merged_cells()
    # Tables nested directly inside a cell, keyed by their element
    self.sub_tables = {
        x: Table(x, styles, para_map, is_sub_table=True)
        for x in XPath('./w:tr/w:tc/w:tbl')(tbl)
    }
def read_underline(parent, dest):
    """
    Store the underline state from the ``w:u`` children of *parent* in
    ``dest.text_decoration``: ``none`` is kept as is, every other non-empty
    value becomes ``underline``. ``inherit`` when nothing is specified.
    """
    decoration = inherit
    for node in XPath("./w:u[@w:val]")(parent):
        kind = get(node, "w:val")
        if not kind:
            continue
        if kind == "none":
            decoration = kind
        else:
            decoration = "underline"
    dest.text_decoration = decoration
def read_shd(parent, dest):
    '''
    Store the shading fill of the ``w:shd`` children of *parent* in
    ``dest.background_color``, converted with
    ``simple_color(..., auto='transparent')``. ``inherit`` when there is no
    non-empty ``w:fill``.
    '''
    background = inherit
    for shading in XPath('./w:shd[@w:fill]')(parent):
        fill = get(shading, 'w:fill')
        if not fill:
            continue
        background = simple_color(fill, auto='transparent')
    dest.background_color = background
def read_look(parent, dest):
    '''
    Store the ``w:val`` of the ``w:tblLook`` children of *parent*, parsed as
    a hexadecimal integer, in ``dest.look``. Defaults to 0; unparseable
    values are ignored.
    '''
    look = 0
    for tbl_look in XPath('./w:tblLook')(parent):
        try:
            look = int(get(tbl_look, 'w:val'), 16)
        except (ValueError, TypeError):
            # Keep the previous value
            pass
    dest.look = look
def read_color(parent, dest):
    '''
    Store the color of the ``w:color`` children of *parent*, converted with
    ``simple_color``, in ``dest.color``. ``inherit`` when there is no
    non-empty ``w:val``.
    '''
    color = inherit
    for node in XPath('./w:color[@w:val]')(parent):
        raw = get(node, 'w:val')
        if raw:
            color = simple_color(raw)
    dest.color = color
def read_col_span(parent, dest):
    '''
    Store the ``w:val`` of the ``w:gridSpan`` children of *parent* as an int
    in ``dest.col_span``. ``inherit`` when missing; unparseable values are
    ignored.
    '''
    span = inherit
    for grid_span in XPath('./w:gridSpan')(parent):
        try:
            span = int(get(grid_span, 'w:val'))
        except (TypeError, ValueError):
            # Keep the previous value
            pass
    dest.col_span = span
def read_direction(parent, dest):
    '''
    Set ``dest.direction`` to ``rtl`` when the ``w:val`` of a ``w:textFlow``
    child of *parent* contains ``rl`` (case-insensitively), to ``inherit``
    otherwise.
    '''
    direction = inherit
    for flow in XPath('./w:textFlow[@w:val]')(parent):
        value = get(flow, 'w:val')
        if value and 'rl' in value.lower():
            direction = 'rtl'
    dest.direction = direction