Python w示例，converter.xml_namespaces.docx_ns.w Python示例

示例#1

0

显示文件

文件： docx_parser.py 项目： hybrid-publishing-lab/typesetr-academic

 def handle_p_content(self, e, current_part):
     """Convert one child element of a paragraph into a list of body items.

     Runs go to `handle_run` and math to `handle_omath`; hyperlinks and
     bookmark starts become `a` elements.  Bookmark ends and unrecognised
     tags yield an empty list (the latter after logging a warning).
     """
     tag = e.tag
     if tag == RUN_TAG:
         return self.handle_run(e)

     if tag == HYPERLINK_TAG:
         rel_id = e.attrib.get(ns.r('id'))
         if rel_id is not None:
             # relationship id present: the target is looked up in the
             # rels of the part this hyperlink occurs in
             part_rels = self.doc.get_rels_for(current_part)
             ref = part_rels[rel_id].attrib['Target']
         else:
             # no relationship id: point at the anchor attribute instead
             ref = '#' + e.attrib[ns.w('anchor')]
         # 'u', 'span' = nuke bogus color and underline
         # styling that google docs likes to add to links;
         # XXX(alexander): rewrite colour less bluntly;
         # this also nukes background color
         recurse = partial(self.handle_p_content, current_part=current_part)
         body = whack(('u', 'span').__contains__, flatmap(recurse, e))
         if not body:
             log.warn('hyperlink with no body to: %r', ref)
         return [mkel('a', {'href': ref}, body)]

     if tag == BOOKMARK_END_TAG:
         return []
     if tag == BOOKMARK_START_TAG:
         return [mkel('a', {'name': e.attrib[ns.w('name')]}, [])]
     if tag == ns.m('oMath'):
         return self.handle_omath(e)

     log.warn('Ignoring unknown tag %s', e.tag)
     return []

示例#2

0

显示文件

 def handle_p_content(self, e, current_part):
     """Convert one child element of a paragraph into a list of body items.

     Runs go to `handle_run` and math to `handle_omath`; hyperlinks and
     bookmark starts become `a` elements.  Bookmark ends and unrecognised
     tags yield an empty list (the latter after logging a warning).
     """
     tag = e.tag
     if tag == RUN_TAG:
         return self.handle_run(e)

     if tag == HYPERLINK_TAG:
         rel_id = e.attrib.get(ns.r('id'))
         if rel_id is not None:
             # relationship id present: the target is looked up in the
             # rels of the part this hyperlink occurs in
             part_rels = self.doc.get_rels_for(current_part)
             ref = part_rels[rel_id].attrib['Target']
         else:
             # no relationship id: point at the anchor attribute instead
             ref = '#' + e.attrib[ns.w('anchor')]
         # 'u', 'span' = nuke bogus color and underline
         # styling that google docs likes to add to links;
         # XXX(alexander): rewrite colour less bluntly;
         # this also nukes background color
         recurse = partial(self.handle_p_content, current_part=current_part)
         body = whack(('u', 'span').__contains__, flatmap(recurse, e))
         if not body:
             log.warn('hyperlink with no body to: %r', ref)
         return [mkel('a', {'href': ref}, body)]

     if tag == BOOKMARK_END_TAG:
         return []
     if tag == BOOKMARK_START_TAG:
         return [mkel('a', {'name': e.attrib[ns.w('name')]}, [])]
     if tag == ns.m('oMath'):
         return self.handle_omath(e)

     log.warn('Ignoring unknown tag %s', e.tag)
     return []

示例#3

0

显示文件

 def get_num_style(self, numid, level):
     """Return the `NumStyle` (numFmt, lvlText) for *numid* at *level*.

     The numbering id is first resolved to its abstract numbering id,
     then the level definition of that abstract numbering is read.
     """
     root = self.numbering.e
     num_ref = root.find(self.A_NUMID_XPATH_TEMPL % numid)
     abstract_num_id = num_ref.attrib[ns.w('val')]
     # exactly one matching level is expected; the unpacking raises
     # ValueError for zero or several matches
     [lvl] = root.iterfind(self.LVL_XPATH_TEMPL % (abstract_num_id, level))
     return NumStyle(numFmt=val(lvl, ns.w('numFmt')),
                     lvlText=val(lvl, ns.w('lvlText')))

示例#4

0

显示文件

def val(e, child_tag, attrib=ns.w('val')):
    """Return *attrib* of the first element yielded by ``e.iter(child_tag)``.

    Gives None when *e* is None, when nothing matches *child_tag*, or
    when the first match does not carry the attribute.
    """
    if e is None:
        return None

    first_match = next(e.iter(child_tag), None)
    return None if first_match is None else first_match.attrib.get(attrib)

示例#5

0

显示文件

 def make_footnote(self, e):
     """Build a `.footnote` element from a footnote or endnote reference.

     The note is looked up by the reference's ``w:id`` attribute, via
     ``self.doc.get_footnote`` for footnote references and
     ``self.doc.get_endnote`` for anything else, and each of its
     paragraphs is converted with `handle_p`.
     """
     note_id = e.attrib[ns.w('id')]
     if e.tag == FOOTNOTE_REFERENCE_TAG:
         note = self.doc.get_footnote(note_id)
         note_part = 'footnotes'
     else:
         # Endnote paragraphs live in the endnotes part, so relationship
         # ids inside them (e.g. hyperlink targets) must be resolved
         # against that part's rels rather than the footnotes' rels.
         note = self.doc.get_endnote(note_id)
         note_part = 'endnotes'
     return mkel('.footnote', {},
                 [self.handle_p(p, current_part=note_part)
                  for p in note.iterfind(P_TAG)])

示例#6

0

显示文件

 def handle_p(self, e, current_part, in_list=False):
     """Convert a paragraph element into a body element.

     The element tag comes from the paragraph style and an alignment
     class is added when the justification is a known one.  Outside of
     lists, an indented paragraph is additionally wrapped in a `.block`
     element carrying the indent level.
     """
     props = first_of_tag(e, P_PROPS_TAG)
     alignment = self.JC_TO_CLASS.get(val(props, ns.w('jc')))
     attrs = add_class({}, alignment) if alignment else {}
     tag = style_to_tag(val(props, ns.w('pStyle')) or '')
     # the paragraph properties, when present, are not content: start
     # with whatever follows them
     if props is None:
         children = iter(e)
     else:
         children = props.itersiblings()
     convert = partial(self.handle_p_content, current_part=current_part)
     para = mkel(tag, attrs, flatmap(convert, children))
     # left indent, expressed in multiples of the default indent width
     left_twips = float(val(props, ns.w('ind'), ns.w('left')) or 0.0)
     indent = int(round(left_twips / self.default_indent_twips))
     if in_list or not indent:
         return para
     block = mkel('.block', {'indent': indent}, [lift_code(para)])
     return hacky_flatten_block(block)

示例#7

0

显示文件

文件： docx_parser.py 项目： hybrid-publishing-lab/typesetr-academic

 def handle_p(self, e, current_part, in_list=False):
     """Convert a paragraph element into a body element.

     The element tag comes from the paragraph style and an alignment
     class is added when the justification is a known one.  Outside of
     lists, an indented paragraph is additionally wrapped in a `.block`
     element carrying the indent level.
     """
     props = first_of_tag(e, P_PROPS_TAG)
     alignment = self.JC_TO_CLASS.get(val(props, ns.w('jc')))
     attrs = add_class({}, alignment) if alignment else {}
     tag = style_to_tag(val(props, ns.w('pStyle')) or '')
     # the paragraph properties, when present, are not content: start
     # with whatever follows them
     if props is None:
         children = iter(e)
     else:
         children = props.itersiblings()
     convert = partial(self.handle_p_content, current_part=current_part)
     para = mkel(tag, attrs, flatmap(convert, children))
     # left indent, expressed in multiples of the default indent width
     left_twips = float(val(props, ns.w('ind'), ns.w('left')) or 0.0)
     indent = int(round(left_twips / self.default_indent_twips))
     if in_list or not indent:
         return para
     block = mkel('.block', {'indent': indent}, [lift_code(para)])
     return hacky_flatten_block(block)

示例#8

0

显示文件

文件： docx_parser.py 项目： hybrid-publishing-lab/typesetr-academic

 def make_footnote(self, e):
     """Build a `.footnote` element from a footnote or endnote reference.

     The note is looked up by the reference's ``w:id`` attribute, via
     ``self.doc.get_footnote`` for footnote references and
     ``self.doc.get_endnote`` for anything else, and each of its
     paragraphs is converted with `handle_p`.
     """
     note_id = e.attrib[ns.w('id')]
     if e.tag == FOOTNOTE_REFERENCE_TAG:
         note = self.doc.get_footnote(note_id)
         note_part = 'footnotes'
     else:
         # Endnote paragraphs live in the endnotes part, so relationship
         # ids inside them (e.g. hyperlink targets) must be resolved
         # against that part's rels rather than the footnotes' rels.
         note = self.doc.get_endnote(note_id)
         note_part = 'endnotes'
     return mkel('.footnote', {},
                 [self.handle_p(p, current_part=note_part)
                  for p in note.iterfind(P_TAG)])

示例#9

0

显示文件

    def process(self, e, handle_p):
        """Feed one paragraph element to the list builder.

        A paragraph without a numbering id is returned immediately,
        preceded by whatever ``self.flush()`` returns.  A paragraph with
        one is recorded as a list item at its nesting level and an empty
        list is returned.

        `handle_p` is called as ``handle_p(e, in_list=...)`` to convert
        the paragraph itself.
        """
        numPr = e.find(P_PROPS_TAG + '/' + ns.w('numPr'))
        numid = val(numPr, ns.w('numId'))
        if not numid:
            # not a list item; presumably flush() emits the list built so
            # far -- its definition is not visible here, confirm
            return self.flush() + [handle_p(e, in_list=False)]

        self.in_list = True  # pylint: disable=W0201
        # a missing ilvl is treated as level 0
        level = int(val(numPr, ns.w('ilvl')) or 0)
        # NOTE(review): this loop only terminates if self.level changes as
        # append_points grows/shrinks; self.level is defined elsewhere --
        # confirm it is derived from append_points.
        while level != self.level:
            if level > self.level:
                # going deeper: open a new nested list inside the current
                # append target and make it the new append target
                self.append_points[-1].append([])
                self.append_points.append(self.append_points[-1][-1])
            else:
                # going shallower: drop the innermost append target
                self.append_points.pop()
        # store the item as a (list type, converted paragraph) pair
        self.append_points[self.level].append(
            (self.list_type(numid, level), handle_p(e, in_list=True)))
        return []

示例#10

0

显示文件

文件： docx_parser.py 项目： hybrid-publishing-lab/typesetr-academic

    def process(self, e, handle_p):
        """Feed one paragraph element to the list builder.

        A paragraph without a numbering id is returned immediately,
        preceded by whatever ``self.flush()`` returns.  A paragraph with
        one is recorded as a list item at its nesting level and an empty
        list is returned.

        `handle_p` is called as ``handle_p(e, in_list=...)`` to convert
        the paragraph itself.
        """
        numPr = e.find(P_PROPS_TAG + '/' + ns.w('numPr'))
        numid = val(numPr, ns.w('numId'))
        if not numid:
            # not a list item; presumably flush() emits the list built so
            # far -- its definition is not visible here, confirm
            return self.flush() + [handle_p(e, in_list=False)]

        self.in_list = True  # pylint: disable=W0201
        # a missing ilvl is treated as level 0
        level = int(val(numPr, ns.w('ilvl')) or 0)
        # NOTE(review): this loop only terminates if self.level changes as
        # append_points grows/shrinks; self.level is defined elsewhere --
        # confirm it is derived from append_points.
        while level != self.level:
            if level > self.level:
                # going deeper: open a new nested list inside the current
                # append target and make it the new append target
                self.append_points[-1].append([])
                self.append_points.append(self.append_points[-1][-1])
            else:
                # going shallower: drop the innermost append target
                self.append_points.pop()
        # store the item as a (list type, converted paragraph) pair
        self.append_points[self.level].append(
            (self.list_type(numid, level),
             handle_p(e, in_list=True)))
        return []

示例#11

0

显示文件

def parse_sectPr(e):  # pylint: disable=C0103
    """Read page width and left/right margins from a ``sectPr`` element.

    Each value is looked up with `val`, wrapped in `Twips` and passed to
    `SectPr` as the keyword of the same name.
    """
    assert e.tag == ns.w('sectPr')

    # (SectPr field, child element name, attribute name)
    lookups = (
        ('page_width', 'pgSz', 'w'),
        ('left_margin', 'pgMar', 'left'),
        ('right_margin', 'pgMar', 'right'),
    )
    fields = {}
    for (field, child, attrib) in lookups:
        fields[field] = Twips(val(e, ns.w(child), ns.w(attrib)))
    return SectPr(**fields)

示例#12

0

显示文件

文件： docx_parser.py 项目： hybrid-publishing-lab/typesetr-academic

    def handle_run(self, r):
        """Convert a run element into a list of body items.

        Each child of the run (after the run properties, if any) is
        turned into text, a tab, a break/pagebreak element, a hyphen
        character, transcluded images or a footnote; unknown tags are
        logged and dropped.  When run properties are present the result
        is normally passed through `apply_rpr`.
        """
        # XXX(ash): pylint is right about this being too complex
        # pylint: disable=R0912
        _ = Var('_')
        ans = []
        rPr = first_of_tag(r, RUN_PROPS_TAG)
        # skip the run properties themselves; they are not content
        content = rPr.itersiblings() if rPr is not None else iter(r)
        for e in content:
            # pylint: disable=W0622
            type = e.attrib.get(ns.w('type'))
            if e.tag == TEXT_TAG:
                # NOTE(review): e.text is appended as-is; if it can be
                # None for an empty text element, None ends up in ans --
                # confirm downstream copes with that
                ans.append(e.text)
            elif e.tag == TAB_TAG:
                # XXX(alexander): this can also work like a '_' or '…' \dotfill
                ans.append('\t')
            elif e.tag in (FOOTNOTE_REF_TAG, ENDNOTE_REF_TAG):
                # XXX(ash): what is going on here
                pass
            # page/column breaks must be tested before the generic break
            # branch below, which would otherwise catch them
            elif e.tag == BREAK_TAG and type in ('page', 'column'):
                ans.append(mkel('.pagebreak', {}, []))
            elif e.tag == BREAK_TAG or e.tag == CR_TAG:
                assert (type is None) or (type == 'textWrapping')
                ans.append(mkel('br', {}, []))
            # FIXME, tags below untested
            elif e.tag == SOFT_HYPHEN_TAG:
                ans.append(SOFT_HYPHEN)
            elif e.tag == NON_BREAKING_HYPHEN_TAG:
                ans.append(NON_BREAKING_HYPHEN)
            elif e.tag == ns.w('drawing'):
                # one transclusion attempt per image matched by IMAGE_XPATH
                ans.extend(
                    flatmap(self.transclude, e.xpath(self.IMAGE_XPATH,
                                                     namespaces=ns.dict)))
            elif e.tag in (FOOTNOTE_REFERENCE_TAG, ENDNOTE_REFERENCE_TAG):
                ans.append(self.make_footnote(e))
            else:
                # movie,
                # rt, ruby, rubyAlign etc. for ruby stuff
                # sym, with special handling for wingdings I guess...
                log.warn('Unknown tag %r', e.tag)
        # NOTE(review): the Seq/Var comparison presumably is a structural
        # pattern match that skips styling when ans starts with a
        # '.footnote' element -- confirm against the Seq/Var definitions
        if rPr is not None and ans != Seq[Seq['.footnote', _:], _:]:
            ans = self.apply_rpr(rPr, ans)

        return ans

示例#13

0

显示文件

    def handle_run(self, r):
        """Convert a run element into a list of body items.

        Each child of the run (after the run properties, if any) is
        turned into text, a tab, a break/pagebreak element, a hyphen
        character, transcluded images or a footnote; unknown tags are
        logged and dropped.  When run properties are present the result
        is normally passed through `apply_rpr`.
        """
        # XXX(ash): pylint is right about this being too complex
        # pylint: disable=R0912
        _ = Var('_')
        ans = []
        rPr = first_of_tag(r, RUN_PROPS_TAG)
        # skip the run properties themselves; they are not content
        content = rPr.itersiblings() if rPr is not None else iter(r)
        for e in content:
            # pylint: disable=W0622
            type = e.attrib.get(ns.w('type'))
            if e.tag == TEXT_TAG:
                # NOTE(review): e.text is appended as-is; if it can be
                # None for an empty text element, None ends up in ans --
                # confirm downstream copes with that
                ans.append(e.text)
            elif e.tag == TAB_TAG:
                # XXX(alexander): this can also work like a '_' or '…' \dotfill
                ans.append('\t')
            elif e.tag in (FOOTNOTE_REF_TAG, ENDNOTE_REF_TAG):
                # XXX(ash): what is going on here
                pass
            # page/column breaks must be tested before the generic break
            # branch below, which would otherwise catch them
            elif e.tag == BREAK_TAG and type in ('page', 'column'):
                ans.append(mkel('.pagebreak', {}, []))
            elif e.tag == BREAK_TAG or e.tag == CR_TAG:
                assert (type is None) or (type == 'textWrapping')
                ans.append(mkel('br', {}, []))
            # FIXME, tags below untested
            elif e.tag == SOFT_HYPHEN_TAG:
                ans.append(SOFT_HYPHEN)
            elif e.tag == NON_BREAKING_HYPHEN_TAG:
                ans.append(NON_BREAKING_HYPHEN)
            elif e.tag == ns.w('drawing'):
                # one transclusion attempt per image matched by IMAGE_XPATH
                ans.extend(
                    flatmap(self.transclude,
                            e.xpath(self.IMAGE_XPATH, namespaces=ns.dict)))
            elif e.tag in (FOOTNOTE_REFERENCE_TAG, ENDNOTE_REFERENCE_TAG):
                ans.append(self.make_footnote(e))
            else:
                # movie,
                # rt, ruby, rubyAlign etc. for ruby stuff
                # sym, with special handling for wingdings I guess...
                log.warn('Unknown tag %r', e.tag)
        # NOTE(review): the Seq/Var comparison presumably is a structural
        # pattern match that skips styling when ans starts with a
        # '.footnote' element -- confirm against the Seq/Var definitions
        if rPr is not None and ans != Seq[Seq['.footnote', _:], _:]:
            ans = self.apply_rpr(rPr, ans)

        return ans

示例#14

0

显示文件

    def parse_table(self, e, current_part):
        """Convert a table element into a `table` body element.

        Column widths from the table grid become percentage-width `col`
        elements; each cell becomes a `th` or `td` depending on the
        table's first-row/first-column header flags, with its background
        taken from the cell shading when there is one.
        """
        # XXX(ash): simplify
        # pylint: disable=R0914
        def cell_bg(tc):
            # background attrs for a cell, or {} when it has no fill
            props = tc[0]
            if props.tag != TABLE_COLUMN_PROPERTIES_TAG:
                return {}
            fill = val(props, ns.w('shd'), ns.w('fill'))
            return add_bg({}, '#' + fill) if fill else {}

        def skip_past(e, child):
            # children of e, minus a leading `child` element if present
            first = e[0]
            if first.tag == child:
                return first.itersiblings()
            return e.iterchildren()

        def parse_rows(e, has_header_row, has_header_col):
            def is_header(i, j):
                return ((i == 0 and has_header_row)
                        or (j == 0 and has_header_col))

            rows = []
            for (i, tr) in enumerate(e.iterfind(TABLE_ROW_TAG)):
                cells = []
                for (j, tc) in enumerate(tr.iterfind(TABLE_COLUMN_TAG)):
                    cell_tag = 'th' if is_header(i, j) else 'td'
                    cell_attrs = cell_bg(tc)
                    cell_body = self.parse_body(
                        skip_past(tc, TABLE_COLUMN_PROPERTIES_TAG),
                        current_part=current_part)
                    cells.append(mkel(cell_tag, cell_attrs, cell_body))
                rows.append(mkel('tr', {}, cells))
            return rows

        tbl_props = first_of_tag(e, ns.w('tblPr'))
        grid = next(tbl_props.itersiblings())
        # according to the schema this is always true
        assert grid.tag == ns.w('tblGrid'), grid.tag
        look = tbl_props.find(ns.w('tblLook'))
        if look is not None:
            # this is actually the canonical check;
            # the identical per cell/row props are just for caching
            has_header_row = look.attrib.get(ns.w('firstRow')) == "1"
            has_header_col = look.attrib.get(ns.w('firstColumn')) == "1"
        else:
            has_header_row = has_header_col = False

        col_widths = [int(gc.attrib[ns.w('w')])
                      for gc in grid.iterchildren(ns.w('gridCol'))]
        col_total = sum(col_widths)
        cols = []
        for width in col_widths:
            # each column's share of the total grid width, in percent
            pct = 100. * width / col_total
            cols.append(mkel('col', add_style({}, 'width', '%s%%' % pct), []))
        rows = parse_rows(e, has_header_row, has_header_col)
        return mkel('table', {}, odt_parser.parse_table_body(cols + rows))

示例#15

0

显示文件

文件： docx_parser.py 项目： hybrid-publishing-lab/typesetr-academic

    def parse_table(self, e, current_part):
        """Convert a table element into a `table` body element.

        Column widths from the table grid become percentage-width `col`
        elements; each cell becomes a `th` or `td` depending on the
        table's first-row/first-column header flags, with its background
        taken from the cell shading when there is one.
        """
        # XXX(ash): simplify
        # pylint: disable=R0914
        def cell_bg(tc):
            # background attrs for a cell, or {} when it has no fill
            props = tc[0]
            if props.tag != TABLE_COLUMN_PROPERTIES_TAG:
                return {}
            fill = val(props, ns.w('shd'), ns.w('fill'))
            return add_bg({}, '#' + fill) if fill else {}

        def skip_past(e, child):
            # children of e, minus a leading `child` element if present
            first = e[0]
            if first.tag == child:
                return first.itersiblings()
            return e.iterchildren()

        def parse_rows(e, has_header_row, has_header_col):
            def is_header(i, j):
                return ((i == 0 and has_header_row)
                        or (j == 0 and has_header_col))

            rows = []
            for (i, tr) in enumerate(e.iterfind(TABLE_ROW_TAG)):
                cells = []
                for (j, tc) in enumerate(tr.iterfind(TABLE_COLUMN_TAG)):
                    cell_tag = 'th' if is_header(i, j) else 'td'
                    cell_attrs = cell_bg(tc)
                    cell_body = self.parse_body(
                        skip_past(tc, TABLE_COLUMN_PROPERTIES_TAG),
                        current_part=current_part)
                    cells.append(mkel(cell_tag, cell_attrs, cell_body))
                rows.append(mkel('tr', {}, cells))
            return rows

        tbl_props = first_of_tag(e, ns.w('tblPr'))
        grid = next(tbl_props.itersiblings())
        # according to the schema this is always true
        assert grid.tag == ns.w('tblGrid'), grid.tag
        look = tbl_props.find(ns.w('tblLook'))
        if look is not None:
            # this is actually the canonical check;
            # the identical per cell/row props are just for caching
            has_header_row = look.attrib.get(ns.w('firstRow')) == "1"
            has_header_col = look.attrib.get(ns.w('firstColumn')) == "1"
        else:
            has_header_row = has_header_col = False

        col_widths = [int(gc.attrib[ns.w('w')])
                      for gc in grid.iterchildren(ns.w('gridCol'))]
        col_total = sum(col_widths)
        cols = []
        for width in col_widths:
            # each column's share of the total grid width, in percent
            pct = 100. * width / col_total
            cols.append(mkel('col', add_style({}, 'width', '%s%%' % pct), []))
        rows = parse_rows(e, has_header_row, has_header_col)
        return mkel('table', {}, odt_parser.parse_table_body(cols + rows))

示例#16

0

显示文件

文件： docx_parser.py 项目： hybrid-publishing-lab/typesetr-academic

 def __init__(self, infilename, make_transclusions):
     """Open *infilename* and cache what the parser works from.

     `make_transclusions`, when truthy, is called with the document's
     images to build ``self.transclusions``; otherwise that is None.
     """
     doc = docxlite.Document(infilename)
     self.doc = doc
     # pylint: disable=W0212
     self.numbering = doc.numbering
     main_part = doc.document
     self.body = main_part.e.find(ns.w('body'))
     self.rels = main_part.rels
     # the last child of the body is parsed as the section properties,
     # which supply the page geometry
     section = docxlite.parse_sectPr(self.body[-1])
     page = section.page_width
     right = section.right_margin
     left = section.left_margin
     # usable text width = page width minus both side margins
     self.textwidth_emu = page.emu.real - right.emu.real - left.emu.real
     self.transclusions = (make_transclusions(doc.get_images())
                           if make_transclusions else None)
     self.default_indent_twips = 720

示例#17

0

显示文件

 def __init__(self, infilename, make_transclusions):
     """Open *infilename* and cache what the parser works from.

     `make_transclusions`, when truthy, is called with the document's
     images to build ``self.transclusions``; otherwise that is None.
     """
     doc = docxlite.Document(infilename)
     self.doc = doc
     # pylint: disable=W0212
     self.numbering = doc.numbering
     main_part = doc.document
     self.body = main_part.e.find(ns.w('body'))
     self.rels = main_part.rels
     # the last child of the body is parsed as the section properties,
     # which supply the page geometry
     section = docxlite.parse_sectPr(self.body[-1])
     page = section.page_width
     right = section.right_margin
     left = section.left_margin
     # usable text width = page width minus both side margins
     self.textwidth_emu = page.emu.real - right.emu.real - left.emu.real
     self.transclusions = (make_transclusions(doc.get_images())
                           if make_transclusions else None)
     self.default_indent_twips = 720

示例#18

0

显示文件

文件： docx_parser.py 项目： hybrid-publishing-lab/typesetr-academic

    def apply_rpr(self, rPr, ans):
        """Wrap *ans* in elements reflecting the run properties *rPr*.

        Wrappers are applied innermost first: basic styles (in
        STYLE_TO_HTML order), text color, background (highlight, else
        shading), vertical alignment and finally `code` for code fonts.
        Returns the (possibly re-wrapped) list.
        """
        present = {child.tag
                   for child in rPr.iterchildren(*self.STYLE_TO_HTML)}
        if present:
            for (style_tag, html) in self.STYLE_TO_HTML.iteritems():
                if style_tag in present:
                    ans = [mkel(html, {}, ans)]

        color = val(rPr, ns.w('color'))
        if color:
            # FIXME word colors
            ans = [mkel('span', add_style({}, 'color', '#' + color), ans)]

        # `None` here == turn highlighting off; it's different from no value
        background = self.HIGHLIGHT_TO_RGB.get(val(rPr, ns.w('highlight')),
                                               False)
        if background is False:
            # no usable highlight (it has higher precedence than shade):
            # fall back to the shading fill
            background = val(rPr, ns.w('shd'), ns.w('fill'))
        if background:
            ans = [mkel('span', add_bg({}, '#' + background), ans)]

        vertalign = val(rPr, ns.w('vertAlign'))
        if vertalign and vertalign != 'baseline':
            # the first three letters of the value are used as the tag
            ans = [mkel(vertalign[:3], {}, ans)]

        font = val(rPr, ns.w('rFonts'), ns.w('ascii'))
        if is_code_font(font):
            ans = [mkel('code', {}, ans)]
        return ans

示例#19

0

显示文件

    def apply_rpr(self, rPr, ans):
        """Wrap *ans* in elements reflecting the run properties *rPr*.

        Wrappers are applied innermost first: basic styles (in
        STYLE_TO_HTML order), text color, background (highlight, else
        shading), vertical alignment and finally `code` for code fonts.
        Returns the (possibly re-wrapped) list.
        """
        present = {child.tag
                   for child in rPr.iterchildren(*self.STYLE_TO_HTML)}
        if present:
            for (style_tag, html) in self.STYLE_TO_HTML.iteritems():
                if style_tag in present:
                    ans = [mkel(html, {}, ans)]

        color = val(rPr, ns.w('color'))
        if color:
            # FIXME word colors
            ans = [mkel('span', add_style({}, 'color', '#' + color), ans)]

        # `None` here == turn highlighting off; it's different from no value
        background = self.HIGHLIGHT_TO_RGB.get(val(rPr, ns.w('highlight')),
                                               False)
        if background is False:
            # no usable highlight (it has higher precedence than shade):
            # fall back to the shading fill
            background = val(rPr, ns.w('shd'), ns.w('fill'))
        if background:
            ans = [mkel('span', add_bg({}, '#' + background), ans)]

        vertalign = val(rPr, ns.w('vertAlign'))
        if vertalign and vertalign != 'baseline':
            # the first three letters of the value are used as the tag
            ans = [mkel(vertalign[:3], {}, ans)]

        font = val(rPr, ns.w('rFonts'), ns.w('ascii'))
        if is_code_font(font):
            ans = [mkel('code', {}, ans)]
        return ans

示例#20

0

显示文件

class Document(object):
    """In-memory view of a docx archive and four of its XML parts.

    The archive is read on construction; the document, numbering,
    footnotes and endnotes parts are exposed as attributes and the whole
    archive can be written back out with `save`.
    """
    # XPath templates for lookups in the numbering part; the readable
    # ``w:`` prefix is replaced by whatever ``ns.w('')`` returns.
    LVL_XPATH_TEMPL = (
        './w:abstractNum[@w:abstractNumId="%s"]/'
        'w:lvl[@w:ilvl="%s"]'
    ).replace('w:', ns.w(''))
    A_NUMID_XPATH_TEMPL = (
        './w:num[@w:numId="%s"]/w:abstractNumId'
    ).replace('w:', ns.w(''))

    def __init__(self, path_or_file):
        """Read the archive at *path_or_file* and fetch the known parts."""
        archive = read_zip(path_or_file)
        self.z = archive
        self.document = get_part(archive, MAGIC_WORD + '/document.xml')
        self.numbering = get_part(archive, MAGIC_WORD + '/numbering.xml')
        self.footnotes = get_part(archive, MAGIC_WORD + '/footnotes.xml')
        self.endnotes = get_part(archive, MAGIC_WORD + '/endnotes.xml')

    @staticmethod
    def _get_by_id(id, xs):  # pylint: disable=W0622
        """Return the first item of *xs* whose ``w:id`` attribute is *id*.

        Raises StopIteration when nothing matches.
        """
        matches = (x for x in xs if id == x.attrib[ns.w('id')])
        return next(matches)

    def get_footnote(self, id):   # pylint: disable=W0622
        """Return the child of the footnotes part with the given id."""
        notes = self.footnotes.e
        return self._get_by_id(id, notes)

    def get_endnote(self, id):  # pylint: disable=W0622
        """Return the child of the endnotes part with the given id."""
        notes = self.endnotes.e
        return self._get_by_id(id, notes)

    def get_num_style(self, numid, level):
        """Return the `NumStyle` (numFmt, lvlText) for *numid* at *level*.

        The numbering id is first resolved to its abstract numbering id,
        then the level definition of that abstract numbering is read.
        """
        root = self.numbering.e
        num_ref = root.find(self.A_NUMID_XPATH_TEMPL % numid)
        abstract_num_id = num_ref.attrib[ns.w('val')]
        # exactly one matching level is expected; the unpacking raises
        # ValueError for zero or several matches
        [lvl] = root.iterfind(self.LVL_XPATH_TEMPL % (abstract_num_id, level))
        return NumStyle(numFmt=val(lvl, ns.w('numFmt')),
                        lvlText=val(lvl, ns.w('lvlText')))

    def get_or_add_extn(self, mime_type):
        """Return the file extension registered for *mime_type*.

        If ``[Content_Types].xml`` has exactly one Default entry for the
        mime type its extension is returned; otherwise a guessed
        extension is registered as a new Default entry and returned.
        """
        path = '[Content_Types].xml'
        e = to_etree(self.z.get(path))
        ctns = 'http://schemas.openxmlformats.org/package/2006/content-types'
        nsmap = {'ct': ctns}
        known = e.xpath(
            './ct:Default[@ContentType="%s"]/@Extension' % mime_type,
            namespaces=nsmap)
        if len(known) == 1:
            return known[0]

        extn = guess_extension(mime_type)
        clashes = e.xpath('./ct:Default[@Extension="%s"]' % extn,
                          namespaces=nsmap)
        assert len(clashes) == 0  # XXX(ash): handle this case more gracefully
        entry = ('Default', {'ContentType': mime_type, 'Extension': extn}, [])
        # the new entry goes in front of the existing children
        e[:0] = [tup2etree(entry, {None: ctns})]
        self.z[path] = etree2s(e)
        return extn

    def add_image(self, f, mime_type):
        """Store the image read from *f* in the archive.

        A fresh media path and a fresh relationship id are allocated;
        the relationship is added to the main document's rels and its id
        is returned.
        """
        extn = self.get_or_add_extn(mime_type)
        data = f.read()

        media_path = fresh_name(set(self.z.keys()),
                                MAGIC_WORD + '/media/image%d.' + extn)
        self.z[media_path] = data

        # XXX(ash): why do we have to relativize the targets
        target = media_path.split('/', 1)[1]
        rid = fresh_name(set(self.document.rels), 'rId%d')
        rel_attrs = {'Target': target, 'Type': IMAGE_REL_URI, 'Id': rid}
        self.document.rels[rid] = tup2etree(('Relationship', rel_attrs, []))
        return rid

    def get_images(self):
        """Return a dict of relationship id -> StringIO of the image data.

        Only image relationships of the main document part are included.
        """
        # XXX(ash): what about the rels of the other parts...
        includes = {}
        for (rid, rel) in self.document.rels.iteritems():
            if rel.attrib['Type'] != IMAGE_REL_URI:
                continue
            target = rel.attrib['Target']
            includes[rid] = StringIO(self.z[MAGIC_WORD + '/' + target])
        return includes

    def get_rels_for(self, part):
        """Return the rels of the part stored under attribute name *part*."""
        return getattr(self, part).rels

    def save(self, f):
        """Write the archive to *f*, re-serialising the known parts.

        Parts with a parsed tree, and rels that are non-empty, replace
        the corresponding archive members; every other member is copied
        through unchanged.
        """
        replacements = {}
        for name in ('document', 'numbering', 'footnotes', 'endnotes'):
            part = getattr(self, name)
            if part.e is not None:
                replacements[part.path] = etree2s(part.e, decl=True)
            if part.rels:
                replacements[part.rels_path] = rels2s(part.rels)
        with ZipFile(f, 'w') as outz:
            for (filename, contents) in self.z.items():
                # a missing or empty replacement falls back to the original
                outz.writestr(filename,
                              replacements.get(filename, None) or contents)

示例#21

0

显示文件

 def _get_by_id(id, xs):  # pylint: disable=W0622
     """Return the first item of *xs* whose ``w:id`` attribute is *id*.

     Raises StopIteration when nothing matches.
     """
     matches = (x for x in xs if id == x.attrib[ns.w('id')])
     return next(matches)

示例#22

0

显示文件

 def cell_bg(tc):
     """Return background attrs for table cell *tc*, or {} without a fill.

     The fill is read from the shading of the cell properties, which are
     only considered when they are the cell's first child.
     """
     props = tc[0]
     if props.tag != TABLE_COLUMN_PROPERTIES_TAG:
         return {}
     fill = val(props, ns.w('shd'), ns.w('fill'))
     return add_bg({}, '#' + fill) if fill else {}

示例#23

0

显示文件

文件： docx_parser.py 项目： hybrid-publishing-lab/typesetr-academic

 def cell_bg(tc):
     """Return background attrs for table cell *tc*, or {} without a fill.

     The fill is read from the shading of the cell properties, which are
     only considered when they are the cell's first child.
     """
     props = tc[0]
     if props.tag != TABLE_COLUMN_PROPERTIES_TAG:
         return {}
     fill = val(props, ns.w('shd'), ns.w('fill'))
     return add_bg({}, '#' + fill) if fill else {}

示例#24

0

显示文件

class Docx(object):
    """Parser that converts the body of a docx document into body elements.

    `parse` is the entry point; the class-level tables below drive how
    images are located and how alignment, basic styles and highlight
    colours are translated.
    """
    # get "normal" inline images (i.e. ignores VML and similar crap)
    IMAGE_XPATH = (
        './*[self::wp:inline|self::wp:anchor]'
        '[.//a:graphicData'
        '[@uri="http://schemas.openxmlformats.org/drawingml/2006/picture"]]')
    # paragraph justification value -> class name added to the paragraph
    JC_TO_CLASS = {
        'left': 'left',
        'right': 'right',
        'center': 'center',
        'both': 'justify',
    }
    # run style tag -> html tag, which is the first letter of the style
    # name (so 'bCs' -> 'b', 'iCs' -> 'i', 'strike' -> 's'); ordered
    # because apply_rpr nests the wrappers in iteration order
    STYLE_TO_HTML = OrderedDict(
        (ns.w(x), x[0]) for x in ['u', 'b', 'bCs', 'i', 'iCs', 'strike'])

    # highlight colour name -> RGB hex string (without '#'); 'none' maps
    # to None, which apply_rpr treats as "highlighting off"
    HIGHLIGHT_TO_RGB = {
        'black': '000000',
        'blue': '0000ff',
        'cyan': '00ffff',
        'darkBlue': '000080',
        'darkCyan': '008080',
        'darkGray': '808080',
        'darkGreen': '008000',
        'darkMagenta': '800080',
        'darkRed': '800000',
        'darkYellow': '808000',
        'green': '00ff00',
        'lightGray': 'c0c0c0',
        'magenta': 'ff00ff',
        'red': 'ff0000',
        'white': 'ffffff',
        'yellow': 'ffff00',
        'none': None,
    }

    def __init__(self, infilename, make_transclusions):
        """Open *infilename* and cache what the parser works from.

        `make_transclusions`, when truthy, is called with the document's
        images to build ``self.transclusions``; otherwise that is None.
        """
        doc = docxlite.Document(infilename)
        self.doc = doc
        # pylint: disable=W0212
        self.numbering = doc.numbering
        main_part = doc.document
        self.body = main_part.e.find(ns.w('body'))
        self.rels = main_part.rels
        # the last child of the body is parsed as the section properties,
        # which supply the page geometry
        section = docxlite.parse_sectPr(self.body[-1])
        page = section.page_width
        right = section.right_margin
        left = section.left_margin
        # usable text width = page width minus both side margins
        self.textwidth_emu = page.emu.real - right.emu.real - left.emu.real
        self.transclusions = (make_transclusions(doc.get_images())
                              if make_transclusions else None)
        self.default_indent_twips = 720

    def parse(self):
        """Parse the document body.

        Returns a ``(body, transclusions)`` tuple: the result of
        `parse_body` on the main document part, and
        ``self.transclusions`` (None when no transclusion factory was
        given to the constructor).
        """
        return (self.parse_body(self.body,
                                current_part='document'), self.transclusions)

    def handle_omath(self, e):  # pylint: disable=W0613
        """Handle a math element; currently a stub that produces nothing."""
        return []

    def handle_p_content(self, e, current_part):
        """Convert one child element of a paragraph into a list of body items.

        Runs go to `handle_run` and math to `handle_omath`; hyperlinks and
        bookmark starts become `a` elements.  Bookmark ends and
        unrecognised tags yield an empty list (the latter after logging a
        warning).
        """
        tag = e.tag
        if tag == RUN_TAG:
            return self.handle_run(e)

        if tag == HYPERLINK_TAG:
            rel_id = e.attrib.get(ns.r('id'))
            if rel_id is not None:
                # relationship id present: the target is looked up in the
                # rels of the part this hyperlink occurs in
                part_rels = self.doc.get_rels_for(current_part)
                ref = part_rels[rel_id].attrib['Target']
            else:
                # no relationship id: point at the anchor attribute instead
                ref = '#' + e.attrib[ns.w('anchor')]
            # 'u', 'span' = nuke bogus color and underline
            # styling that google docs likes to add to links;
            # XXX(alexander): rewrite colour less bluntly;
            # this also nukes background color
            recurse = partial(self.handle_p_content,
                              current_part=current_part)
            body = whack(('u', 'span').__contains__, flatmap(recurse, e))
            if not body:
                log.warn('hyperlink with no body to: %r', ref)
            return [mkel('a', {'href': ref}, body)]

        if tag == BOOKMARK_END_TAG:
            return []
        if tag == BOOKMARK_START_TAG:
            return [mkel('a', {'name': e.attrib[ns.w('name')]}, [])]
        if tag == ns.m('oMath'):
            return self.handle_omath(e)

        log.warn('Ignoring unknown tag %s', e.tag)
        return []

    def transclude(self, pic):
        """Turn an image element into a one-element list holding a figure.

        Returns an empty list when transclusions are disabled or when the
        image does not carry exactly one embedded-image id (the latter is
        logged).
        """
        if self.transclusions is None:
            return []

        width_emu = float(val(pic, ns.wp('extent'), 'cx'))
        embeds = pic.xpath('.//a:blip/@r:embed', namespaces=ns.dict)
        if len(embeds) != 1:
            log.warn('Expected exactly one r:embed with an image id, got %r',
                     embeds)
            return []
        image_id = embeds[0]

        href = self.transclusions.normalize_known_transclusion(image_id)
        # width of the image relative to the usable text width
        relwidth = width_emu / self.textwidth_emu
        # the local tag name (after the namespace) is 'anchor' or 'inline'
        local_name = pic.tag.split('}')[1]
        is_inline = {'anchor': False, 'inline': True}[local_name]
        figure = make_figure(relwidth=relwidth,
                             inline=is_inline,
                             body=[mkel('img', {'src': href}, [])],
                             src=href,
                             original_href=image_id)
        return [figure]

    def make_footnote(self, e):
        """Build a `.footnote` element from a footnote or endnote reference.

        The note is looked up by the reference's ``w:id`` attribute, via
        ``self.doc.get_footnote`` for footnote references and
        ``self.doc.get_endnote`` for anything else, and each of its
        paragraphs is converted with `handle_p`.
        """
        note_id = e.attrib[ns.w('id')]
        if e.tag == FOOTNOTE_REFERENCE_TAG:
            note = self.doc.get_footnote(note_id)
            note_part = 'footnotes'
        else:
            # Endnote paragraphs live in the endnotes part, so relationship
            # ids inside them (e.g. hyperlink targets) must be resolved
            # against that part's rels rather than the footnotes' rels.
            note = self.doc.get_endnote(note_id)
            note_part = 'endnotes'
        return mkel('.footnote', {},
                    [self.handle_p(p, current_part=note_part)
                     for p in note.iterfind(P_TAG)])

    def handle_run(self, r):
        """Convert a run element into a list of body items.

        Each child of the run (after the run properties, if any) is
        turned into text, a tab, a break/pagebreak element, a hyphen
        character, transcluded images or a footnote; unknown tags are
        logged and dropped.  When run properties are present the result
        is normally passed through `apply_rpr`.
        """
        # XXX(ash): pylint is right about this being too complex
        # pylint: disable=R0912
        _ = Var('_')
        ans = []
        rPr = first_of_tag(r, RUN_PROPS_TAG)
        # skip the run properties themselves; they are not content
        content = rPr.itersiblings() if rPr is not None else iter(r)
        for e in content:
            # pylint: disable=W0622
            type = e.attrib.get(ns.w('type'))
            if e.tag == TEXT_TAG:
                # NOTE(review): e.text is appended as-is; if it can be
                # None for an empty text element, None ends up in ans --
                # confirm downstream copes with that
                ans.append(e.text)
            elif e.tag == TAB_TAG:
                # XXX(alexander): this can also work like a '_' or '…' \dotfill
                ans.append('\t')
            elif e.tag in (FOOTNOTE_REF_TAG, ENDNOTE_REF_TAG):
                # XXX(ash): what is going on here
                pass
            # page/column breaks must be tested before the generic break
            # branch below, which would otherwise catch them
            elif e.tag == BREAK_TAG and type in ('page', 'column'):
                ans.append(mkel('.pagebreak', {}, []))
            elif e.tag == BREAK_TAG or e.tag == CR_TAG:
                assert (type is None) or (type == 'textWrapping')
                ans.append(mkel('br', {}, []))
            # FIXME, tags below untested
            elif e.tag == SOFT_HYPHEN_TAG:
                ans.append(SOFT_HYPHEN)
            elif e.tag == NON_BREAKING_HYPHEN_TAG:
                ans.append(NON_BREAKING_HYPHEN)
            elif e.tag == ns.w('drawing'):
                # one transclusion attempt per image matched by IMAGE_XPATH
                ans.extend(
                    flatmap(self.transclude,
                            e.xpath(self.IMAGE_XPATH, namespaces=ns.dict)))
            elif e.tag in (FOOTNOTE_REFERENCE_TAG, ENDNOTE_REFERENCE_TAG):
                ans.append(self.make_footnote(e))
            else:
                # movie,
                # rt, ruby, rubyAlign etc. for ruby stuff
                # sym, with special handling for wingdings I guess...
                log.warn('Unknown tag %r', e.tag)
        # NOTE(review): the Seq/Var comparison presumably is a structural
        # pattern match that skips styling when ans starts with a
        # '.footnote' element -- confirm against the Seq/Var definitions
        if rPr is not None and ans != Seq[Seq['.footnote', _:], _:]:
            ans = self.apply_rpr(rPr, ans)

        return ans

    def apply_rpr(self, rPr, ans):
        """Wrap *ans* in elements reflecting the run properties *rPr*.

        Wrappers are applied innermost first: basic styles (in
        STYLE_TO_HTML order), text color, background (highlight, else
        shading), vertical alignment and finally `code` for code fonts.
        Returns the (possibly re-wrapped) list.
        """
        present = {child.tag
                   for child in rPr.iterchildren(*self.STYLE_TO_HTML)}
        if present:
            for (style_tag, html) in self.STYLE_TO_HTML.iteritems():
                if style_tag in present:
                    ans = [mkel(html, {}, ans)]

        color = val(rPr, ns.w('color'))
        if color:
            # FIXME word colors
            ans = [mkel('span', add_style({}, 'color', '#' + color), ans)]

        # `None` here == turn highlighting off; it's different from no value
        background = self.HIGHLIGHT_TO_RGB.get(val(rPr, ns.w('highlight')),
                                               False)
        if background is False:
            # no usable highlight (it has higher precedence than shade):
            # fall back to the shading fill
            background = val(rPr, ns.w('shd'), ns.w('fill'))
        if background:
            ans = [mkel('span', add_bg({}, '#' + background), ans)]

        vertalign = val(rPr, ns.w('vertAlign'))
        if vertalign and vertalign != 'baseline':
            # the first three letters of the value are used as the tag
            ans = [mkel(vertalign[:3], {}, ans)]

        font = val(rPr, ns.w('rFonts'), ns.w('ascii'))
        if is_code_font(font):
            ans = [mkel('code', {}, ans)]
        return ans

    def handle_p(self, e, current_part, in_list=False):
        """Convert the paragraph element `e` into a single element.

        The element name comes from the paragraph style and the `jc` value
        may add a class. The children (everything after the paragraph
        properties, when those are present) are converted with
        `handle_p_content`.

        Outside of lists, a paragraph whose left indent rounds to a non-zero
        multiple of `self.default_indent_twips` is additionally wrapped in a
        `.block` element carrying that indent level.
        """
        props = first_of_tag(e, P_PROPS_TAG)
        alignment = self.JC_TO_CLASS.get(val(props, ns.w('jc')))
        attrs = add_class({}, alignment) if alignment else {}
        tag = style_to_tag(val(props, ns.w('pStyle')) or '')

        # With a properties element present, the content is whatever follows
        # it; otherwise every child of the paragraph is content.
        if props is None:
            children = iter(e)
        else:
            children = props.itersiblings()
        convert = partial(self.handle_p_content, current_part=current_part)
        para = mkel(tag, attrs, flatmap(convert, children))

        twips = val(props, ns.w('ind'), ns.w('left')) or 0.0
        indent = int(round(float(twips) / self.default_indent_twips))
        if in_list or not indent:
            return para

        block = mkel('.block', {'indent': indent}, [lift_code(para)])
        return hacky_flatten_block(block)

    def parse_body(self, xml, current_part):
        """Convert the block-level elements in `xml` into a list.

        Paragraphs are routed through a `ListBuilder`; any other element
        first flushes the builder. Tables are parsed with `parse_table`,
        section properties are ignored, and anything else is logged and
        dropped.
        """
        lists = ListBuilder(self.doc)
        convert_p = partial(self.handle_p, current_part=current_part)
        out = []

        for child in xml:
            if child.tag == P_TAG:
                out.extend(lists.process(child, convert_p))
                continue

            # A non-paragraph element: emit whatever the builder still holds
            # before handling the element itself.
            out.extend(lists.flush())
            if child.tag == TABLE_TAG:
                out.append(self.parse_table(child, current_part))
            elif child.tag != SECTION_PROPERTIES_TAG:
                log.warn('Unrecognized element: %s', child.tag)

        out.extend(lists.flush())
        return out

    def parse_table(self, e, current_part):
        """Convert the table element `e` into a `table` element.

        The widths in the table grid become `col` elements with percentage
        widths, every row becomes a `tr` whose cells are parsed recursively
        with `parse_body`, and the combined `col` + `tr` list is passed
        through `odt_parser.parse_table_body`.

        A grid column without a `w:w` attribute counts as zero wide; when the
        widths add up to zero the columns share the width evenly instead of
        triggering a division by zero.

        The element following `tblPr` is asserted to be `tblGrid`.
        """
        # XXX(ash): simplify
        # pylint: disable=R0914
        def cell_bg(tc):
            # Background attributes for a cell, taken from the shading fill
            # of its leading properties element (if it has one).
            if tc[0].tag == TABLE_COLUMN_PROPERTIES_TAG:
                bg = val(tc[0], ns.w('shd'), ns.w('fill'))
                if bg:
                    return add_bg({}, '#' + bg)
            return {}

        def skip_past(e, child):
            # Iterate over the children of `e`, leaving out a leading
            # element with tag `child`.
            if e[0].tag == child:
                return e[0].itersiblings()
            return e.iterchildren()

        def parse_rows(e, has_header_row, has_header_col):
            def is_header(i, j):
                return ((i == 0 and has_header_row) or
                        (j == 0 and has_header_col))

            return [
                mkel('tr', {}, [
                    mkel(
                        'th' if is_header(i, j) else 'td', cell_bg(tc),
                        self.parse_body(skip_past(tc,
                                                  TABLE_COLUMN_PROPERTIES_TAG),
                                        current_part=current_part))
                    for (j, tc) in enumerate(tr.iterfind(TABLE_COLUMN_TAG))
                ]) for (i, tr) in enumerate(e.iterfind(TABLE_ROW_TAG))
            ]

        tblPr = first_of_tag(e, ns.w('tblPr'))
        tbl_stuff = tblPr.itersiblings()
        tblGrid = next(tbl_stuff)
        # according to the schema this is always true
        assert tblGrid.tag == ns.w('tblGrid'), tblGrid.tag
        look = tblPr.find(ns.w('tblLook'))
        if look is None:
            has_header_row = has_header_col = False
        else:
            # this is actually the canonical check;
            # the identical per cell/row props are just for caching
            has_header_row, has_header_col = (look.attrib.get(k) == "1"
                                              for k in (ns.w('firstRow'),
                                                        ns.w('firstColumn')))

        grid_cols = tblGrid.iterchildren(ns.w('gridCol'))
        # `w:w` is optional on gridCol; a missing width is treated as zero
        col_widths = [int(gc.attrib.get(ns.w('w'), 0)) for gc in grid_cols]
        col_total = sum(col_widths)
        if col_total:
            col_pcts = [100. * w / col_total for w in col_widths]
        else:
            # No usable widths at all: split the width evenly. Dividing per
            # element keeps an empty grid from dividing by zero.
            col_pcts = [100. / len(col_widths) for _ in col_widths]
        cols = [
            mkel('col', add_style({}, 'width', '%s%%' % w), [])
            for w in col_pcts
        ]
        rows = parse_rows(e, has_header_row, has_header_col)
        table = odt_parser.parse_table_body(cols + rows)
        return mkel('table', {}, table)

    def strip_meta(self, unaugmented_meta, transclusions, asides):
        """Remove the leading body elements that make up the metadata.

        Ever longer prefixes of `self.body` are parsed and postprocessed
        until the metadata extracted from a prefix equals
        `unaugmented_meta`; that prefix is then cut out of the body.

        Note that the prefix spanning the entire body is never tried.

        Raises `Exception` when no prefix yields matching metadata.
        """
        # XXX(ash): :(
        from converter.postprocess import postprocess
        for end in range(len(self.body)):
            prefix = self.parse_body(self.body[:end],
                                     current_part='document')
            prefix_result = postprocess(prefix, transclusions, asides=asides)
            prefix_meta = prefix_result[0]
            if prefix_meta == unaugmented_meta:
                self.body[:end] = []
                return
        raise Exception('failed to find the end of the metadata')

    @staticmethod
    def meta_to_docx(meta, intern_image, total_w):
        """Render the items of `meta` as a list of paragraph elements.

        The 'title' and 'subtitle' items (when present and truthy) come
        first, each as a paragraph styled 'Title' / 'Subtitle'. Every other
        item becomes a paragraph consisting of the underlined key followed
        by a colon, a space and the value.

        `intern_image` and `total_w` are passed on to `meta_to_runs`.
        """
        remaining = meta.raw_items().copy()
        to_runs = partial(meta_to_runs,
                          intern_image=intern_image,
                          total_w=total_w)
        paragraphs = []

        for style_name in ('Title', 'Subtitle'):
            value = remaining.pop(style_name.lower(), None)
            if not value:
                continue
            # FIXME(ash): currently we don't ensure the styles exist
            style = mkel('w:pStyle', {'w:val': style_name}, [])
            props = mkel('w:pPr', {}, [style])
            paragraphs.append(make_p(props, *to_runs(value)))

        # Whatever was not popped above is emitted as "key: value" lines.
        for key, value in remaining.iteritems():
            label = to_runs([mkel('u', {}, [str(key) + ':']), ' '])
            runs = label + to_runs(value)
            paragraphs.append(make_p(*runs))

        return [tup2etree(para, nsmap=ns.dict) for para in paragraphs]

    def intern_image(self, image):
        """Add `image` to the document and return the id `add_image` gives.

        `image` must be a `literal.Image`; its data is handed over wrapped
        in a file-like object together with its mimetype.
        """
        assert isinstance(image, literal.Image)
        stream = StringIO(image.data)
        return self.doc.add_image(stream, image.mimetype)

    def insert_meta(self, meta):
        """Insert the docx rendering of `meta` at the start of the body."""
        meta_elements = self.meta_to_docx(meta, self.intern_image,
                                          self.textwidth_emu)
        self.body[:0] = meta_elements

    def save_to(self, f):
        """Save the document to `f` by delegating to `self.doc.save`."""
        document = self.doc
        document.save(f)