def parse_table_body(body): #pylint: disable=R0914
    """Regroup the children of a table and promote header cells.

    Only the elements of `body` tagged 'col' or 'tr' are used.

    * If every cell of the first row is either blank or holds a single 'p'
      whose only child is a 'b' (optionally wrapped in one 'span'), the row
      is rebuilt out of 'th' cells, and any 'class' found on a header cell
      is added to the 'col' at the same index.
    * If the first cell of every row passes the same test, those cells get
      the 'headcol' class.

    Returns a list: one 'colgroup' element wrapping the cols, followed by
    the (possibly rewritten) rows.
    """
    def extract_header(elems):
        """Return (True, one attrs dict per cell) when all of `elems` look
        like header cells, otherwise (False, [])."""
        attrs = []
        for elem in elems:
            # Fresh pattern variables for every cell; a successful comparison
            # appears to capture the matched part into `.val` (it is read
            # right after the match below).
            _TAG, TATTRS, PATTRS, _SATTRS, _BATTRS, TBODY = map(
                Var,
                "_TAG, TATTRS, PATTRS, _SATTRS, _BATTRS, TBODY".split(', '))
            _BLANK_BODY = Var('_BLANK', blank)
            if elem == (_TAG, TATTRS, _BLANK_BODY):
                # empty cell - accept, but do not propagate attrs, apart
                # from background-color.  The cell still gets its own entry
                # so that the result stays index-aligned with `elems`.
                cell_attrs = {}
                bg = TATTRS.val.get('style', {}).get('background-color')
                if bg:
                    iadd_style(cell_attrs, 'background-color', bg)
                attrs.append(cell_attrs)
            elif elem in ((_TAG, TATTRS, [('p', PATTRS,
                                           [('span', _SATTRS,
                                             [('b', _BATTRS, TBODY)])])]),
                          (_TAG, TATTRS, [('p', PATTRS,
                                           [('b', _BATTRS, TBODY)])])):
                attrs.append(merge_attrs(TATTRS.val, PATTRS.val))
            else:
                #not header set
                return False, []

        return True, attrs

    cols = [el for el in body if el[0] == 'col']
    trs = [el for el in body if el[0] == 'tr']

    if not trs:
        # No rows at all: there is nothing to inspect for headers, and
        # indexing trs[0] below would fail.
        return [mkel('colgroup', {}, cols)]

    has_header_row, header_attrs = extract_header(trs[0][2]) #rows[0].body
    if has_header_row:
        header_row = []
        ncols = []
        # NOTE(review): assumes there is a 'col' for every cell of the first
        # row -- cols[index] fails otherwise; confirm against the producer.
        for index, td in enumerate(trs[0][2]):
            ctag, cattrs, cbody = cols[index]
            header_row.append(mkel('th', header_attrs[index], td[2]))
            if 'class' in header_attrs[index]:
                cattrs = add_class(cattrs, *header_attrs[index]['class'])
            ncols.append(mkel(ctag, cattrs, cbody))
        trs = [mkel('tr', {}, header_row)] + trs[1:]
        cols = ncols

    # The loop variable is deliberately not called `body`: in Python 2 a list
    # comprehension variable leaks and would rebind the parameter.
    has_header_column, _col_attrs = extract_header(
        [row_body[0] for (_, _, row_body) in trs])
    if has_header_column:
        ntrs = []
        for (trtag, trattrs, trbody) in trs:
            tdtag, tdattrs, tdbody = trbody[0]
            ntd = mkel(tdtag, add_class(tdattrs, 'headcol'), tdbody)
            ntrs.append(mkel(trtag, trattrs, [ntd] + trbody[1:]))
        trs = ntrs
    return [mkel('colgroup', {}, cols)] + trs
예제 #2
def parse_table_body(body):  #pylint: disable=R0914
    """Regroup the children of a table and promote header cells.

    Only the elements of `body` tagged 'col' or 'tr' are used.

    * If every cell of the first row is either blank or holds a single 'p'
      whose only child is a 'b' (optionally wrapped in one 'span'), the row
      is rebuilt out of 'th' cells, and any 'class' found on a header cell
      is added to the 'col' at the same index.
    * If the first cell of every row passes the same test, those cells get
      the 'headcol' class.

    Returns a list: one 'colgroup' element wrapping the cols, followed by
    the (possibly rewritten) rows.
    """
    def extract_header(elems):
        """Return (True, one attrs dict per cell) when all of `elems` look
        like header cells, otherwise (False, [])."""
        attrs = []
        for elem in elems:
            # Fresh pattern variables for every cell; a successful comparison
            # appears to capture the matched part into `.val` (it is read
            # right after the match below).
            _TAG, TATTRS, PATTRS, _SATTRS, _BATTRS, TBODY = map(
                Var,
                "_TAG, TATTRS, PATTRS, _SATTRS, _BATTRS, TBODY".split(', '))
            _BLANK_BODY = Var('_BLANK', blank)
            if elem == (_TAG, TATTRS, _BLANK_BODY):
                # empty cell - accept, but do not propagate attrs, apart
                # from background-color.  The cell still gets its own entry
                # so that the result stays index-aligned with `elems`.
                cell_attrs = {}
                bg = TATTRS.val.get('style', {}).get('background-color')
                if bg:
                    iadd_style(cell_attrs, 'background-color', bg)
                attrs.append(cell_attrs)
            elif elem in ((_TAG, TATTRS, [
                ('p', PATTRS, [('span', _SATTRS, [('b', _BATTRS, TBODY)])])
            ]), (_TAG, TATTRS, [('p', PATTRS, [('b', _BATTRS, TBODY)])])):
                attrs.append(merge_attrs(TATTRS.val, PATTRS.val))
            else:
                #not header set
                return False, []

        return True, attrs

    cols = [el for el in body if el[0] == 'col']
    trs = [el for el in body if el[0] == 'tr']

    if not trs:
        # No rows at all: there is nothing to inspect for headers, and
        # indexing trs[0] below would fail.
        return [mkel('colgroup', {}, cols)]

    has_header_row, header_attrs = extract_header(trs[0][2])  #rows[0].body
    if has_header_row:
        header_row = []
        ncols = []
        # NOTE(review): assumes there is a 'col' for every cell of the first
        # row -- cols[index] fails otherwise; confirm against the producer.
        for index, td in enumerate(trs[0][2]):
            ctag, cattrs, cbody = cols[index]
            header_row.append(mkel('th', header_attrs[index], td[2]))
            if 'class' in header_attrs[index]:
                cattrs = add_class(cattrs, *header_attrs[index]['class'])
            ncols.append(mkel(ctag, cattrs, cbody))
        trs = [mkel('tr', {}, header_row)] + trs[1:]
        cols = ncols

    # The loop variable is deliberately not called `body`: in Python 2 a list
    # comprehension variable leaks and would rebind the parameter.
    has_header_column, _col_attrs = extract_header(
        [row_body[0] for (_, _, row_body) in trs])
    if has_header_column:
        ntrs = []
        for (trtag, trattrs, trbody) in trs:
            tdtag, tdattrs, tdbody = trbody[0]
            ntd = mkel(tdtag, add_class(tdattrs, 'headcol'), tdbody)
            ntrs.append(mkel(trtag, trattrs, [ntd] + trbody[1:]))
        trs = ntrs
    return [mkel('colgroup', {}, cols)] + trs
예제 #3
def _propagate_alignment(content, cols):
    trs = [el for el in content if el[0] == 'tr']
    for _, _, tds in trs:
        assert len(tds) == len(cols), \
            "Table row has not enough cells: %s" % tds
        for cid, col in enumerate(cols):
            if 'class' in col[1]:
                attrs = tds[cid][1]
                # FIXME ugly hack
                attrs.update(add_class(attrs, *col[1]['class']))
def _propagate_alignment(content, cols):
    trs = [el for el in content if el[0] == 'tr']
    for _, _, tds in trs:
        assert len(tds) == len(cols), \
            "Table row has not enough cells: %s" % tds
        for cid, col in enumerate(cols):
            if 'class' in col[1]:
                attrs = tds[cid][1]
                # FIXME ugly hack
                attrs.update(add_class(attrs, *col[1]['class']))
예제 #5
 def handle_p(self, e, current_part, in_list=False):
     """Turn the paragraph element `e` into an internal element.

     The justification ('jc') property, when it maps to an entry of
     self.JC_TO_CLASS, becomes a class; the paragraph style ('pStyle')
     picks the tag.  Outside of lists, a left indent that rounds to a
     non-zero multiple of self.default_indent_twips wraps the result in a
     '.block' element carrying that indent.
     """
     props = first_of_tag(e, P_PROPS_TAG)
     justification = self.JC_TO_CLASS.get(val(props, ns.w('jc')))
     attrs = add_class({}, justification) if justification else {}
     tag = style_to_tag(val(props, ns.w('pStyle')) or '')
     # Without a properties element every child is content; otherwise the
     # content is whatever follows the properties element.
     if props is None:
         children = iter(e)
     else:
         children = props.itersiblings()
     convert = partial(self.handle_p_content, current_part=current_part)
     paragraph = mkel(tag, attrs, flatmap(convert, children))
     left_twips = val(props, ns.w('ind'), ns.w('left')) or 0.0
     indent = int(round(float(left_twips) / self.default_indent_twips))
     if in_list or not indent:
         return paragraph
     block = mkel('.block', {'indent': indent}, [lift_code(paragraph)])
     return hacky_flatten_block(block)
 def handle_p(self, e, current_part, in_list=False):
     """Turn the paragraph element `e` into an internal element.

     The justification ('jc') property, when it maps to an entry of
     self.JC_TO_CLASS, becomes a class; the paragraph style ('pStyle')
     picks the tag.  Outside of lists, a left indent that rounds to a
     non-zero multiple of self.default_indent_twips wraps the result in a
     '.block' element carrying that indent.
     """
     props = first_of_tag(e, P_PROPS_TAG)
     justification = self.JC_TO_CLASS.get(val(props, ns.w('jc')))
     attrs = add_class({}, justification) if justification else {}
     tag = style_to_tag(val(props, ns.w('pStyle')) or '')
     # Without a properties element every child is content; otherwise the
     # content is whatever follows the properties element.
     if props is None:
         children = iter(e)
     else:
         children = props.itersiblings()
     convert = partial(self.handle_p_content, current_part=current_part)
     paragraph = mkel(tag, attrs, flatmap(convert, children))
     left_twips = val(props, ns.w('ind'), ns.w('left')) or 0.0
     indent = int(round(float(left_twips) / self.default_indent_twips))
     if in_list or not indent:
         return paragraph
     block = mkel('.block', {'indent': indent}, [lift_code(paragraph)])
     return hacky_flatten_block(block)
예제 #7
def handle_fragment(fragment, indent, transclusions, h_shift, epub_clean,
    # NOTE(review): the parameter list is never closed in this excerpt, and
    # `bibliography` is read further down without being a visible parameter
    # -- the tail of the signature looks lost; confirm against the original.
    #
    # Renders one fragment to an HTML string: a plain string is returned
    # escaped, anything else is unpacked as a (tag, attrs, content) triple
    # and rendered through a %-template.
    # pylint: disable=R0911,R0914,R0912,R0913,R0915
    # FIXME(alexander): clean this up a bit, and get rid of pylint muffles
    if isinstance(fragment, basestring):
        return cgi.escape(fragment)

    (tag, attrs, content) = fragment
    if tag in ['script', 'style'] and content:
        # single-element unpacking: exactly one content item is expected
        content_str, = content
        # NOTE(review): this dict(...) call is not closed and mixes a keyword
        # argument with a bare positional expression -- lines appear to be
        # missing from the excerpt.
        return NOT_INLINE_TEMPLATE % dict(
            attrs_str=encode_attrs(attrs, transclusions, epub_clean),
                '\n' + maybe_cdatafy(_indent(content_str.strip('\n'), ' ')),
    if tag == 'pre':
        # preformatted blocks are rendered entirely by the highlighter
        return '\n' + highlight.as_html(fragment)

    # special case figures and tables
    if tag == 'figure':
        # work on a copy of the style; 'width' is taken off the figure and
        # put on the img (the last content item) instead
        style = attrs['style'].copy()
        width = style.pop('width', '100%')
        attrs = dict(attrs.items(), style=style)
        # FIXME(alexander): dirty hacks to fixup caption & width
        img = content[-1]
        assert img[0] == 'img'
        img[1].setdefault('style', OrderedDict())['width'] = width
        # put figcaption towards end
        if content[0][0] == 'figcaption':
            content[0], content[-1] = content[-1], content[0]
        if style['display'] == 'inline':
            ATTRS = Var('ATTRS')  # pylint: disable=C0103
            assert content[:1] == [('img', ATTRS, [])], \
                "figure does not begin with an img"
            attrs = add_class(ATTRS.val, 'margin')
            # peel of the figure tag for inlined stuff
            # as a hack to make epub/html validate
            # (figures can't occur in all contexts imgs can)
            # NOTE(review): the call below is not closed in this excerpt.
            return handle_fragments([('img', attrs, [])],
    elif tag == 'table':
        colgroups = [el for el in content if el[0] == 'colgroup']
        COLS = Var("COLS")  # pylint: disable=C0103
        assert colgroups == [('colgroup', {}, COLS)], \
                "Expected single colgroup in table %s" % content
        # FIXME(alexander): this deepcopy is a lazy hack so we can mutate away
        # imperatively propagate table cell alignment down
        # this is a pretty horrible hack and would blow
        # up nastily if there is attribute aliasing,
        # but deepcopying should kinda make it work
        content = copy.deepcopy(content)
        _propagate_alignment(content, COLS.val)

    elif tag == 'col':
        if not epub_clean:
            # lift the width out of the style into a plain `width`
            # attribute and drop the style altogether
            attrs = attrs.copy()
            attrs['width'] = attrs['style']['width']
            del attrs['style']
        # cull
        ## return handle_fragments(content, indent)
    # FIXME(alexander): might make more sense to filter (or h-ify) these out
    # elsewhere, but for now this seems not unreasonable
    elif tag == 'title':
        tag = 'h1'
        attrs = add_class(attrs, 'title')
    elif tag == 'subtitle':
        tag = 'h2'
        attrs = add_class(attrs, 'subtitle')
    elif tag in ('CMD', 'LIT'):
        bad_command = None
        # single-element unpacking: exactly one class is expected
        cmd_type, = attrs['class']
        # FIXME(alexander): convert tex to html for non-math;
        # convert tex math to MML for epub
        if cmd_type in ('$', 'tex'):
            tex, = content
            if cmd_type == '$':
                tex = r'\(%s\)' % tex
            return '<span class="tex2jax_process">%s</span>' % cgi.escape(tex)
        elif CITE_REX.match(cmd_type):
            if bibliography:
                # post = ('[%s]' % content[1] if len(content) > 1 and content[1]
                #         else '')
                # Post is ignored for the moment
                return _format_citation(cmd_type, content[0], bibliography)
                # NOTE(review): the string below is the tail of a call whose
                # opening line(s) are missing from this excerpt.
                    'Citation exists, but bibliography is missing')
            # NOTE(review): as shown, the two lines below sit inside the
            # citation branch; presumably an `else:` was lost before them.
            bad_command = cmd_type + (':' if content else '')
            docerror.docproblem('Unknown command type:%s' % cmd_type)
    elif epub_clean:
        if tag == 'a' and 'name' in attrs:
            # a named anchor becomes an element carrying only an id
            assert len(attrs) == 1
            attrs = {'id': attrs['name']}
        elif tag == 'img':
            # keep every attribute except width and height
            # NOTE(review): the closing brace is missing in this excerpt.
            attrs = {
                k: attrs[k]
                for k in attrs if k not in ('width', 'height')

    # FIXME(alexander): support continued-list properly in html, by keeping
    # track of numbers of items per list-id and translating it to start

    if tag in H_TAGS:
        if h_shift:
            # shift the heading level, clamped to the range 1..len(H_TAGS)
            tag = 'h%d' % min(len(H_TAGS), max(1, int(tag[1]) + h_shift))

    # generic [tagname].class tags
    if '.' in tag:
        if tag == '.pagebreak':
            tag = 'div.pagebreak'  # for whitespace sanitization
        tagname, classname = tag.split('.', 1)
        # an empty tag name (leading dot) falls back to 'span'
        tag = tagname or 'span'
        attrs = add_class(attrs, classname)

    # `bad_command` is only assigned in the CMD/LIT branch above; the
    # short-circuit on the tag keeps it from being read otherwise
    if tag == 'CMD' and bad_command:
        tag = 'span'
        attrs = {'class': ['bad-command']}
        content = [('u', {}, [bad_command])] + content
    elif tag == 'ERR':
        tag = 'span'
        attrs = {'class': ['err'], 'title': attrs['info'][0]}

    # NOTE(review): the call below is not closed in this excerpt.
    content_str = handle_fragments(content,
                                   indent='  ' + indent,
    if tag in VOID_TAGS:
        assert not content
        template = "<%(tag)s%(attrs_str)s/>"
    elif tag in INLINE:
        template = "<%(tag)s%(attrs_str)s>%(content_str)s</%(tag)s>"
    elif '\n' in content_str:
        template = NOT_INLINE_TEMPLATE
    # NOTE(review): no fallback branch is visible, so `template` would be
    # unbound when none of the three tests match -- presumably an `else:`
    # is missing here; confirm against the original.

    # FIXME(alexander): disgusting hack; fix this properly and
    # use a set representation to start with!
    classes = attrs.get('class')
    if classes:
        # de-duplicate and sort the classes on a copy of the attrs
        attrs = attrs.copy()
        attrs['class'] = sorted(set(classes))

    # NOTE(review): the final call is cut off in this excerpt.
    return template % dict(indent=indent,
                           attrs_str=encode_attrs(attrs, transclusions,
예제 #8
def parse_body(xml, context, normalize_transclusion):
    # Generator: walks the children of `xml` and yields, per child, plain
    # strings (whitespace runs, tail text) and elements built with mkel().
    # Children are parsed recursively before the element itself is built.
    # NOTE(review): several lines (else/continue branches, call tails) appear
    # to be missing from this excerpt; each spot is flagged below.
    # pylint: disable=R0912,R0915,R0914
    for e in xml:
        text = (e.text or '')
        tail = (e.tail or '')

        # some style properties should be promoted to tags, e.g. underlining
        # and bolding
        tags_from_style = []
        stys_dealt_with = []

        if e.tag in (S_TAG, TAB_TAG):
            # the boolean indexes into ' \t': a space for S_TAG, a tab for
            # TAB_TAG, repeated as many times as the 'c' attribute says
            # (default '1')
            yield ' \t'[e.tag == TAB_TAG] * int(e.attrib.get(
                ns.text('c'), '1'))
            if tail:
                yield tail
            # NOTE(review): nothing visible stops processing here, so as
            # shown the element also falls through to the generic handling
            # below -- presumably a `continue` was lost.

        if e.tag == LINEBREAK_TAG:
            yield mkel('br', {}, [])

        # the style is looked up by the element's style name, falling back
        # to its table style name
        sty = context.stys.get(
            e.get(STYLE_NAME_ATTR) or e.get(TABLE_STYLE_NAME_ATTR))
        # handle page breaks
        if sty and sty.par_break:
            assert e.tag in (H_TAG, P_TAG), \
                   "Unexpected page-break in %r" % e.tag
            yield mkel('.pagebreak', {}, [])
        # Handle lists specially
        if e.tag == LIST_TAG:
            new_context = context.bump_list_level(sty)
            # NOTE(review): as shown, the next line unconditionally
            # overrides the bumped context; presumably an `else:` is missing
            # and it belongs to the non-list case.
            new_context = context
        body = list(parse_body(e, new_context, normalize_transclusion))
        # parsed as: (body is a list and is empty) or (its first item is
        # not a list)
        assert type(body) is list and not body or type(body[0]) is not list
        attrs = {}
        if text:
            body = [text] + body
        if sty and sty.type.endswith('title'):
            # title-like styles: the style type becomes the tag, the body is
            # flattened with plaintextify, and the style is dropped so it is
            # not applied again below
            head = sty.type
            body = [plaintextify(body)]
            sty = None
        elif e.tag == H_TAG:
            # skip empty headings; NB: this *must* happen
            # after we extracted eventual page-breaks, which are the only
            # useful information empty headings can contain
            # NOTE(review): the `if` below has no body in this excerpt.
            if blank(body):
            head = sty.type
            # FIXME(alexander): keep track of the headings breadcrumbs in
            # context for two reasons
            #  1. to associate errors with specific headings
            #  2. to warn about bad structure e.g. h1 followed by h4,
            #     rather than h2
        elif e.tag == LIST_TAG:
            head = new_context.list_type
            assert head in ('ol', 'ul')
            list_start = new_context.list_start
            if list_start is not None:
                assert head == 'ol'
                attrs['start'] = str(list_start)

            id_ = e.attrib.get(ns.xml('id'))  # pylint: disable=E1101
            if id_ is not None:
                attrs['id'] = id_
            continues = e.attrib.get(ns.text('continue-list'))
            if continues is not None:
                # make this a data attrib, so we can stuff it
                # into the html, which doesn't have direct support
                attrs['data-continue-list'] = continues

        elif e.tag == LIST_ITEM_TAG:
            head = 'li'
        elif e.tag == ANNOTATION_TAG:
            head = 'aside'
            #FIXME: extract content
            if text:
                # NOTE(review): the logging call below is not closed in this
                # excerpt.
                log.warning('Hey, someone actually specified a %s: %s', e.tag,
            if tail:
                yield tail
        elif e.tag == NOTE_TAG:
            # other valid option is 'endnote'
            assert e.attrib[ns.text('note-class')] == 'footnote'
            # skip ahead and exit early; we only represent the note-body
            assert len(e) == 2 and e[1].tag == NOTE_BODY_TAG
            assert len(body) == 1
            yield body[0]
            if tail:
                yield tail
            # NOTE(review): the comment above says "exit early", but no
            # `continue` is visible after these yields -- presumably lost.
        elif e.tag == NOTE_BODY_TAG:
            head = '.footnote'
            # FIXME(alexander): sucky hack to strip the bogus whitespace
            # google docs enters at the beginning of a footnote for some
            # reason. I should really write a more generic whitespace
            # stripping mechanism in the postprocess module that can recognize
            # consecutive whitespace even if seperated-by/wrapped-in inline
            # tags.
            _, B1, B2, = map(Var, '_, B1, B2'.split(', '))
            # matches only strings that start with whitespace
            SPACED_STR = Var(
                'SPACED_STR', lambda s:
                (isinstance(s, basestring) and re.match(r'\s+', s)))
            # if the body starts with a 'p' whose first item is such a
            # string, replace that string with its left-stripped version
            if body == Seq[('p', _, Seq[SPACED_STR, B2:]), B1:]:
                body[0][2][0] = SPACED_STR.val.lstrip()
        # FIXME(alexander): add anchors for all paras
        elif e.tag == P_TAG:
            # parsed as: (margin_left or text_indent) if sty else None
            margin = sty.margin_left or sty.text_indent if sty else None
            indent_level = in_indents(margin) if margin else 0
            if indent_level:
                head = '.block'
                attrs['indent'] = indent_level
                # NOTE(review): as shown, the next line overrides '.block'
                # right away; presumably an `else:` is missing before it.
                head = 'p'

        #FIXME styled links etc. gdocs might not use that...
        #... but we should be able to handle non-span bolding etc.
        elif e.tag == SPAN_TAG:
            # XXX: order can matter; we need
            #   <b><u>command</u><b>
            # not
            #   <u><b>command</b><u>
            # but more generally the minimal coalescing of abutting partially
            # overlapping styles is something that needs to be thought about
            # properly at some point.
            # each entry: (style attribute, values that switch it on, html
            # tags)
            # NOTE(review): the list literal / loop header, the log.error
            # call and the is_code_font branch below are all incomplete in
            # this excerpt.
            for attr, on_values, html_tags in [
                ('underline', [True], ['u']), ('font_weight', ['bold'], ['b']),
                ('font_style', ['italic'], ['i']),
                ('line_through', [True], ['s']),
                ('text_position', ['sub', 'super'], ['sub', 'sup'])
                value = getattr(sty, attr, None)
                if value:
                    if value not in on_values:
                        log.error("Bad value for %s: %s in %s", attr, value,
            if is_code_font(sty.font_family):
            head = 'span'
        elif e.tag == A_TAG:
            assert e.attrib[ns.xlink('type')] == 'simple'
            head = 'a'
            attrs = dict(href=e.attrib[HREF_ATTR])
            # FIXME the in 'span' check is a bit too general, should use
            # something else to markup textcolor
            body = tidy(whack(lambda x: x in ('span', 'u'), body))
        elif e.tag == BOOKMARK_START_TAG:
            head = 'a'
            attrs = dict(name=e.attrib[TEXT_NAME_ATTR])
            # a bookmark start must carry no text and be directly followed
            # by its end tag
            assert (blank(text) and blank(tail)
                    and next(e.itersiblings()).tag == BOOKMARK_END_TAG)
        elif e.tag == TABLE_TAG:
            head = 'table'
            body = parse_table_body(body)
        elif e.tag == TABLE_ROW_TAG:
            head = 'tr'
        elif e.tag == TABLE_CELL_TAG:
            head = 'td'
        #FIXME repetition via table:number-columns-repeated
        #FIXME handle column-groups
        elif e.tag == TABLE_COLUMN_TAG:
            head = 'col'
            sty = context.stys.get(e.attrib.get(ns.table('style-name')))
            if sty and sty.width is not None:
                # XXX this isn't really the column width
                # since google moronically saves this even
                # if set column width is turned off thank you google!
                attrs = dict(style=OrderedDict(width=sty.width))

        elif e.tag == FRAME_TAG:
            # XXX: try to find caption
            # FIXME(alexander): keep figures/tables with captions in context,
            # so that we can produce a lot/loi; add an id for all of them
            inline = e.attrib[ns.text('anchor-type')] == 'as-char'
            # NOTE(review): the brackets on the 'rel-width' line do not
            # balance -- part of the expression looks lost.
            width = (
                e.attrib.get(ns.svg('width'))  # pylint: disable=E1101
                or e.attrib['rel-width')])
            # FIXME(alexander): should handle all these, in theory:
            # <>
            # ("em" | "ex" | "px" | "in" | "cm" | "mm" | "pt" | "pc" )
            assert width.endswith('cm'), \
                'Expected figure width in cm, got %s' % width
            # figure width as a fraction of the text width
            relwidth = float(width[:-2]) / context.stys.textwidth
            # NOTE(review): the make_figure call below is not closed and
            # some of its arguments appear to be missing.
            head, attrs, body = make_figure(
                # FIXME(alexander): the body[0][1] to access the image
                # will blow up on leading whitespace in the body
                body=list(x for x in body
                          if not (isinstance(x, basestring) and blank(x))),
        elif e.tag == IMAGE_TAG:
            head = 'img'
            attrs = dict(src=normalize_transclusion(e.attrib[HREF_ATTR]))
            # NOTE(review): as shown, the warning below fires for every
            # image; presumably it belongs to a lost `else:` branch for
            # unrecognised tags.
            log.warning('Ignoring tag %s', e.tag)
            # FIXME raise RuntimeError('Unexpected tag: %s' % e.tag)
        # wrap the tidied body in one element per entry of tags_from_style,
        # the first entry ending up innermost
        sty_tagged = reduce(lambda parsed, tag: [mkel(tag, {}, parsed)],
                            tags_from_style, tidy(body))
        if sty:
            if sty.text_align:
                attrs = add_class(attrs, sty.text_align)
            if sty.background_color:
                iadd_style(attrs, 'background-color', sty.background_color)
            if sty.color:
                iadd_style(attrs, 'color', sty.color)
        if e.tag == LIST_TAG:
            if new_context.list_style_type:
                attrs = add_class(attrs, new_context.list_style_type)
        # FIXME additional tidy
        parsed = mkel(head, attrs, sty_tagged)
        if head == 'span' and 'style' in attrs:
            # a styled span wrapping nothing but a 'code' element is turned
            # inside out, so that 'code' becomes the outer element
            B = Var('B')
            if parsed == ('span', attrs, [('code', {}, B)]):
                parsed = mkel('code', {}, [('span', attrs, B.val)])

        # style properties that are active but were not handled above
        leftover_styles = sty and set(
            sty.active_props()) - set(stys_dealt_with)
        if leftover_styles:
            # NOTE(review): the logging call below is not closed in this
            # excerpt.
            log.warn('Ignoring style elements: %r in %r "%s"',
                     ([(k, getattr(sty, k)) for k in leftover_styles]), head,
        preprocess.maybe_anchorize_id(head, attrs, sty_tagged)
        yield parsed
        if tail:
            yield tail
def handle_fragment(fragment, indent,
                    transclusions, h_shift, epub_clean, bibliography):
    # Renders one fragment to an HTML string: a plain string is returned
    # escaped, anything else is unpacked as a (tag, attrs, content) triple
    # and rendered through a %-template.
    # NOTE(review): several call tails and branches appear to be missing
    # from this excerpt; each spot is flagged below.
    # pylint: disable=R0911,R0914,R0912,R0913,R0915
    # FIXME(alexander): clean this up a bit, and get rid of pylint muffles
    if isinstance(fragment, basestring):
        return cgi.escape(fragment)

    (tag, attrs, content) = fragment
    if tag in ['script', 'style'] and content:
        # single-element unpacking: exactly one content item is expected
        content_str, = content
        # NOTE(review): this dict(...) call is not closed and mixes a keyword
        # argument with a bare positional expression -- lines appear to be
        # missing from the excerpt.
        return NOT_INLINE_TEMPLATE % dict(
            attrs_str=encode_attrs(attrs, transclusions, epub_clean),
                '\n' + maybe_cdatafy(_indent(content_str.strip('\n'), ' ')),
    if tag == 'pre':
        # preformatted blocks are rendered entirely by the highlighter
        return '\n' + highlight.as_html(fragment)

    # special case figures and tables
    if tag == 'figure':
        # work on a copy of the style; 'width' is taken off the figure and
        # put on the img (the last content item) instead
        style = attrs['style'].copy()
        width = style.pop('width', '100%')
        attrs = dict(attrs.items(), style=style)
        # FIXME(alexander): dirty hacks to fixup caption & width
        img = content[-1]
        assert img[0] == 'img'
        img[1].setdefault('style', OrderedDict())['width'] = width
        # put figcaption towards end
        if content[0][0] == 'figcaption':
            content[0], content[-1] = content[-1], content[0]
        if style['display'] == 'inline':
            ATTRS = Var('ATTRS') # pylint: disable=C0103
            assert content[:1] == [('img', ATTRS, [])], \
                "figure does not begin with an img"
            attrs = add_class(ATTRS.val, 'margin')
            # peel of the figure tag for inlined stuff
            # as a hack to make epub/html validate
            # (figures can't occur in all contexts imgs can)
            # NOTE(review): the call below is not closed in this excerpt.
            return handle_fragments([('img', attrs, [])],
    elif tag == 'table':
        colgroups = [el for el in content if el[0] == 'colgroup']
        COLS = Var("COLS") # pylint: disable=C0103
        assert colgroups == [('colgroup', {}, COLS)], \
                "Expected single colgroup in table %s" % content
        # FIXME(alexander): this deepcopy is a lazy hack so we can mutate away
        # imperatively propagate table cell alignment down
        # this is a pretty horrible hack and would blow
        # up nastily if there is attribute aliasing,
        # but deepcopying should kinda make it work
        content = copy.deepcopy(content)
        _propagate_alignment(content, COLS.val)

    elif tag == 'col':
        if not epub_clean:
            # lift the width out of the style into a plain `width`
            # attribute and drop the style altogether
            attrs = attrs.copy()
            attrs['width'] = attrs['style']['width']
            del attrs['style']
        # cull
        ## return handle_fragments(content, indent)
    # FIXME(alexander): might make more sense to filter (or h-ify) these out
    # elsewhere, but for now this seems not unreasonable
    elif tag == 'title':
        tag = 'h1'
        attrs = add_class(attrs, 'title')
    elif tag == 'subtitle':
        tag = 'h2'
        attrs = add_class(attrs, 'subtitle')
    elif tag in ('CMD', 'LIT'):
        bad_command = None
        # single-element unpacking: exactly one class is expected
        cmd_type, = attrs['class']
        # FIXME(alexander): convert tex to html for non-math;
        # convert tex math to MML for epub
        if cmd_type in ('$', 'tex'):
            tex, = content
            if cmd_type == '$':
                tex = r'\(%s\)' % tex
            return '<span class="tex2jax_process">%s</span>' % cgi.escape(tex)
        elif CITE_REX.match(cmd_type):
            if bibliography:
                # post = ('[%s]' % content[1] if len(content) > 1 and content[1]
                #         else '')
                # Post is ignored for the moment
                return _format_citation(cmd_type, content[0], bibliography)
                # NOTE(review): the string below is the tail of a call whose
                # opening line(s) are missing from this excerpt.
                    'Citation exists, but bibliography is missing')
            # NOTE(review): as shown, the two lines below sit inside the
            # citation branch; presumably an `else:` was lost before them.
            bad_command = cmd_type + (':' if content else '')
            docerror.docproblem('Unknown command type:%s' % cmd_type)
    elif epub_clean:
        if tag == 'a' and 'name' in attrs:
            # a named anchor becomes an element carrying only an id
            assert len(attrs) == 1
            attrs = {'id': attrs['name']}
        elif tag == 'img':
            # keep every attribute except width and height
            attrs = {k: attrs[k] for k in attrs if k not in ('width', 'height')}

    # FIXME(alexander): support continued-list properly in html, by keeping
    # track of numbers of items per list-id and translating it to start

    if tag in H_TAGS:
        if h_shift:
            # shift the heading level, clamped to the range 1..len(H_TAGS)
            tag = 'h%d' % min(len(H_TAGS), max(1, int(tag[1]) + h_shift))

    # generic [tagname].class tags
    if '.' in tag:
        if tag == '.pagebreak':
            tag = 'div.pagebreak' # for whitespace sanitization
        tagname, classname = tag.split('.', 1)
        # an empty tag name (leading dot) falls back to 'span'
        tag = tagname or 'span'
        attrs = add_class(attrs, classname)

    # `bad_command` is only assigned in the CMD/LIT branch above; the
    # short-circuit on the tag keeps it from being read otherwise
    if tag == 'CMD' and bad_command:
        tag = 'span'
        attrs = {'class': ['bad-command']}
        content = [('u', {}, [bad_command])] +  content
    elif tag == 'ERR':
        tag = 'span'
        attrs = {'class': ['err'], 'title': attrs['info'][0]}

    # NOTE(review): the call below is not closed in this excerpt.
    content_str = handle_fragments(content,
                                   indent='  ' + indent,
    if tag in VOID_TAGS:
        assert not content
        template = "<%(tag)s%(attrs_str)s/>"
    elif tag in INLINE:
        template = "<%(tag)s%(attrs_str)s>%(content_str)s</%(tag)s>"
    elif '\n' in content_str:
        template = NOT_INLINE_TEMPLATE
    # NOTE(review): no fallback branch is visible, so `template` would be
    # unbound when none of the three tests match -- presumably an `else:`
    # is missing here; confirm against the original.

    # FIXME(alexander): disgusting hack; fix this properly and
    # use a set representation to start with!
    classes = attrs.get('class')
    if classes:
        # de-duplicate and sort the classes on a copy of the attrs
        attrs = attrs.copy()
        attrs['class'] = sorted(set(classes))

    # NOTE(review): the final call is cut off in this excerpt.
    return template % dict(
        attrs_str=encode_attrs(attrs, transclusions, epub_clean),
def parse_body(xml, context, normalize_transclusion):
    """Generate parsed content for every child element of `xml`.

    This is a generator.  For each child `e` it recursively parses `e`'s own
    children, picks an output tag (`head`) and attribute dict (`attrs`) based
    on `e.tag` and the element's style, and yields the result of
    `mkel(head, attrs, ...)`.  Plain strings are yielded as well: runs of
    spaces/tabs for the whitespace tags, and the element's `tail` text.

    Args:
        xml: iterable of element objects exposing `tag`, `text`, `tail`,
            `attrib`, `get` and `itersiblings` (presumably lxml elements of
            an OpenDocument file -- TODO confirm against the caller).
        context: object providing at least `stys` (a style lookup with `get`
            and a `textwidth` attribute) and `bump_list_level(sty)`; it is
            passed down to the recursive calls.
        normalize_transclusion: callable applied to the href attribute of
            image elements to produce the `src` attribute.

    Yields:
        Strings and `mkel(...)` results, in document order.
    """
    # NOTE(review): several statements in this function look truncated
    # (unbalanced brackets, `if` headers without a body, missing `else:`
    # branches).  Each such spot is flagged with an inline NOTE(review) below;
    # compare against version control before relying on this code.
    # pylint: disable=R0912,R0915,R0914
    for e in xml:
        text = (e.text or '')
        tail = (e.tail or '')

        # some style properties should be promoted to tags, e.g. underlining
        # and bolding
        tags_from_style = []
        stys_dealt_with = []

        if e.tag in (S_TAG, TAB_TAG):
            # Indexing ' \t' with a bool picks ' ' for S_TAG and '\t' for
            # TAB_TAG; the character is repeated per the text 'c' (count)
            # attribute, which defaults to 1.
            yield ' \t'[e.tag == TAB_TAG] * int(e.attrib.get(ns.text('c'), '1'))
            if tail:
                yield tail
            # NOTE(review): there is no `continue` here, so control falls
            # through to the generic handling below -- confirm intended.

        if e.tag == LINEBREAK_TAG:
            yield mkel('br', {}, [])
            # NOTE(review): likewise no tail handling / `continue` is visible
            # for line breaks -- confirm.

        # Look up the style object for this element.
        # NOTE(review): the call below ends in `or` with an unclosed
        # parenthesis; its continuation line appears to be missing.
        sty = context.stys.get(e.get(STYLE_NAME_ATTR) or
        # handle page breaks
        if sty and sty.par_break:
            assert e.tag in (H_TAG, P_TAG), \
                   "Unexpected page-break in %r" % e.tag
            yield mkel('.pagebreak', {}, [])
        # Handle lists specially
        if e.tag == LIST_TAG:
            new_context = context.bump_list_level(sty)
            # NOTE(review): as written the next assignment overwrites the one
            # above and `new_context` is unset for non-list tags; an `else:`
            # appears to be missing here.
            new_context = context
        # Recurse into the children of `e` before deciding how to wrap them.
        body = list(parse_body(e, new_context, normalize_transclusion))
        assert type(body) is list and not body or type(body[0]) is not list
        attrs = {}
        if text:
            # leading text of the element comes before its parsed children
            body = [text] + body
        if sty and sty.type.endswith('title'):
            # title-like styles: use the style type as the tag and flatten
            # the body via plaintextify; the style is dropped so that none
            # of the style-based attrs further down get applied
            head = sty.type
            body = [plaintextify(body)]
            sty = None
        elif e.tag == H_TAG:
            # skip empty headings; NB: this *must* happen
            # after we extracted eventual page-breaks, which are the only
            # useful information empty headings can contain
            # NOTE(review): the `if` below has no body in the visible code
            # (presumably a `continue` was lost) -- confirm.
            if blank(body):
            head = sty.type
            # FIXME(alexander): keep track of the headings breadcrumbs in
            # context for two reasons
            #  1. to associate errors with specific headings
            #  2. to warn about bad structure e.g. h1 followed by h4,
            #     rather than h2
        elif e.tag == LIST_TAG:
            head = new_context.list_type
            assert head in ('ol', 'ul')
            list_start = new_context.list_start
            if list_start is not None:
                # an explicit start number is only expected on ordered lists
                assert head == 'ol'
                attrs['start'] = str(list_start)

            id_ = e.attrib.get(ns.xml('id')) # pylint: disable=E1101
            if id_ is not None:
                attrs['id'] = id_
            continues = e.attrib.get(ns.text('continue-list'))
            if continues is not None:
                # make this a data attrib, so we can stuff it
                # into the html, which doesn't have direct support
                attrs['data-continue-list'] = continues

        elif e.tag == LIST_ITEM_TAG:
            head = 'li'
        elif e.tag == ANNOTATION_TAG:
            head = 'aside'
            #FIXME: extract content
            if text:
                log.warning('Hey, someone actually specified a %s: %s',
                            e.tag, text)
            if tail:
                yield tail
        elif e.tag == NOTE_TAG:
            # other valid option is 'endnote'
            assert e.attrib[ns.text('note-class')] == 'footnote'
            # skip ahead and exit early; we only represent the note-body
            assert len(e) == 2 and e[1].tag == NOTE_BODY_TAG
            assert len(body) == 1
            yield body[0]
            if tail:
                yield tail
            # NOTE(review): the comment above says "exit early" but no
            # `continue` is visible and `head` is not set in this branch.
        elif e.tag == NOTE_BODY_TAG:
            head = '.footnote'
            # FIXME(alexander): sucky hack to strip the bogus whitespace
            # google docs enters at the beginning of a footnote for some
            # reason. I should really write a more generic whitespace
            # stripping mechanism in the postprocess module that can recognize
            # consecutive whitespace even if separated-by/wrapped-in inline
            # tags.
            # Pattern variables for the structural match below; SPACED_STR
            # only matches strings that start with whitespace.
            _, B1, B2, = map(Var, '_, B1, B2'.split(', '))
            SPACED_STR = Var('SPACED_STR', lambda s: (isinstance(s, basestring)
                                                      and re.match(r'\s+', s)))
            # If the footnote starts with a 'p' whose first child is such a
            # string, replace that string in place with its lstrip()ed form.
            if body == Seq[('p', _, Seq[SPACED_STR, B2:]), B1:]:
                body[0][2][0] = SPACED_STR.val.lstrip()
        # FIXME(alexander): add anchors for all paras
        elif e.tag == P_TAG:
            # Parses as `(margin_left or text_indent) if sty else None`.
            margin = sty.margin_left or sty.text_indent if sty else None
            indent_level = in_indents(margin) if margin else 0
            if indent_level:
                head = '.block'
                attrs['indent'] = indent_level
                # NOTE(review): as written `head` is immediately reset to
                # 'p'; an `else:` appears to be missing before this line.
                head = 'p'

        #FIXME styled links etc. gdocs might not use that...
        #... but we should be able to handle non-span bolding etc.
        elif e.tag == SPAN_TAG:
            # XXX: order can matter; we need
            #   <b><u>command</u><b>
            # not
            #   <u><b>command</b><u>
            # but more generally the minimal coalescing of abutting partially
            # overlapping styles is something that needs to be thought about
            # properly at some point.
            # Each entry is (style attribute, accepted values, html tags).
            # NOTE(review): the last entry has lost its opening
            # `(<attribute name>,` line and the list its closing `]:`.
            for attr, on_values, html_tags in [
                    ('underline', [True], ['u']),
                    ('font_weight', ['bold'], ['b']),
                    ('font_style', ['italic'], ['i']),
                    ('line_through', [True], ['s']),
                     ['sub', 'super'],
                     ['sub', 'sup'])
                value = getattr(sty, attr, None)
                if value:
                    if value not in on_values:
                        log.error("Bad value for %s: %s in %s",
                                  attr, value, e.tag)
                    # NOTE(review): nothing visible here appends to
                    # `tags_from_style` or `stys_dealt_with`; the code that
                    # used `html_tags` appears to be missing.
            # NOTE(review): the `if` below has no body in the visible code.
            if is_code_font(sty.font_family):
            head = 'span'
        elif e.tag == A_TAG:
            assert e.attrib[ns.xlink('type')] == 'simple'
            head = 'a'
            attrs = dict(href=e.attrib[HREF_ATTR])
            # FIXME the in 'span' check is a bit too general, should use
            # something else to markup textcolor
            # presumably strips 'span'/'u' wrappers from the link body --
            # verify against the definition of `whack`
            body = tidy(whack(lambda x: x in ('span', 'u'), body))
        elif e.tag == BOOKMARK_START_TAG:
            head = 'a'
            attrs = dict(name=e.attrib[TEXT_NAME_ATTR])
            # a bookmark start must carry no text of its own and must be
            # directly followed by its matching end tag
            assert (blank(text) and blank(tail) and
                    next(e.itersiblings()).tag == BOOKMARK_END_TAG)
        elif e.tag == TABLE_TAG:
            head = 'table'
            body = parse_table_body(body)
        elif e.tag == TABLE_ROW_TAG:
            head = 'tr'
        elif e.tag == TABLE_CELL_TAG:
            head = 'td'
        #FIXME repetition via table:number-columns-repeated
        #FIXME handle column-groups
        elif e.tag == TABLE_COLUMN_TAG:
            head = 'col'
            # columns reference their style via the table style-name
            # attribute, so the style is looked up again here
            sty = context.stys.get(e.attrib.get(ns.table('style-name')))
            if sty and sty.width is not None:
                # XXX this isn't really the column width
                # since google moronically saves this even
                # if set column width is turned off thank you google!
                attrs = dict(style=OrderedDict(width=sty.width))

        elif e.tag == FRAME_TAG:
            # XXX: try to find caption
            # FIXME(alexander): keep figures/tables with captions in context,
            # so that we can produce a lot/loi; add an id for all of them
            inline = e.attrib[ns.text('anchor-type')] == 'as-char'
            # NOTE(review): the brackets on the continuation line below are
            # unbalanced; a namespace helper call around 'rel-width' appears
            # to have been lost.
            width = (e.attrib.get(ns.svg('width')) # pylint: disable=E1101
                     or e.attrib['rel-width')])
            # FIXME(alexander): should handle all these, in theory:
            # <>
            # ("em" | "ex" | "px" | "in" | "cm" | "mm" | "pt" | "pc" )
            assert width.endswith('cm'), \
                'Expected figure width in cm, got %s' % width
            # figure width as a fraction of the text width; the trailing
            # 'cm' is stripped before conversion
            relwidth = float(width[:-2]) / context.stys.textwidth
            # NOTE(review): the make_figure(...) call below is never closed
            # in the visible code.
            head, attrs, body = make_figure(
                relwidth=relwidth, inline=inline,
                # FIXME(alexander): the body[0][1] to access the image
                # will blow up on leading whitespace in the body
                body=list(x for x in body
                          if not (isinstance(x, basestring) and blank(x))),
        elif e.tag == IMAGE_TAG:
            head = 'img'
            attrs = dict(src=normalize_transclusion(e.attrib[HREF_ATTR]))
            # NOTE(review): as written the warning below fires for every
            # image; an `else:` for unrecognised tags appears to be missing.
            log.warning('Ignoring tag %s', e.tag)
            # FIXME raise RuntimeError('Unexpected tag: %s' % e.tag)
        # Wrap the tidied body in one element per entry of tags_from_style;
        # the first entry ends up innermost, the last one outermost.
        sty_tagged = reduce(lambda parsed, tag: [mkel(tag, {}, parsed)],
                            tags_from_style, tidy(body))
        # Style properties that are carried over as classes / inline styles.
        if sty:
            if sty.text_align:
                attrs = add_class(attrs, sty.text_align)
            if sty.background_color:
                iadd_style(attrs, 'background-color', sty.background_color)
            if sty.color:
                iadd_style(attrs, 'color', sty.color)
        if e.tag == LIST_TAG:
            if new_context.list_style_type:
                attrs = add_class(attrs, new_context.list_style_type)
        # FIXME additional tidy
        parsed = mkel(head, attrs, sty_tagged)
        if head == 'span' and 'style' in attrs:
            # A styled span whose only child is an attribute-less 'code'
            # element is re-nested so that 'code' becomes the outer element.
            B = Var('B')
            if parsed == ('span', attrs, [('code', {}, B)]):
                parsed = mkel('code', {}, [('span', attrs, B.val)])

        # Active style properties that were not recorded as handled; the
        # `sty and ...` short-circuits to a falsy value when there is no style.
        leftover_styles = sty and set(sty.active_props()) - set(stys_dealt_with)
        if leftover_styles:
            # NOTE(review): the log.warn(...) call below is never closed in
            # the visible code; its last argument appears to be missing.
            log.warn('Ignoring style elements: %r in %r "%s"', (
                [(k, getattr(sty, k)) for k in leftover_styles]), head,
        preprocess.maybe_anchorize_id(head, attrs, sty_tagged)
        yield parsed
        if tail:
            yield tail