def parse_table_body(body):  # pylint: disable=R0914
    def extract_header(elems):
        attrs = []
        for elem in elems:
            _TAG, TATTRS, PATTRS, _SATTRS, _BATTRS, TBODY = map(
                Var, "_TAG, TATTRS, PATTRS, _SATTRS, _BATTRS, TBODY".split(', '))
            _BLANK_BODY = Var('_BLANK', blank)
            if elem == (_TAG, TATTRS, _BLANK_BODY):
                # empty cell - accept, but do not propagate attrs, apart
                # from background-color
                bg = TATTRS.val.get('style', {}).get('background-color')
                attrs.append({})
                if bg:
                    iadd_style(attrs[-1], 'background-color', bg)
            elif elem in ((_TAG, TATTRS,
                           [('p', PATTRS,
                             [('span', _SATTRS, [('b', _BATTRS, TBODY)])])]),
                          (_TAG, TATTRS,
                           [('p', PATTRS, [('b', _BATTRS, TBODY)])])):
                attrs.append(merge_attrs(TATTRS.val, PATTRS.val))
            else:  # not header set
                return False, []
        return True, attrs

    cols = [el for el in body if el[0] == 'col']
    trs = [el for el in body if el[0] == 'tr']
    has_header_row, header_attrs = extract_header(trs[0][2])  # rows[0].body
    if has_header_row:
        header_row = []
        ncols = []
        for index, td in enumerate(trs[0][2]):
            ctag, cattrs, cbody = cols[index]
            header_row.append(mkel('th', header_attrs[index], td[2]))
            if 'class' in header_attrs[index]:
                cattrs = add_class(cattrs, *header_attrs[index]['class'])
            ncols.append(mkel(ctag, cattrs, cbody))
        trs = [mkel('tr', {}, header_row)] + trs[1:]
        cols = ncols
    has_header_column, _col_attrs = extract_header(
        [body[0] for (_, _, body) in trs])
    if has_header_column:
        ntrs = []
        for (trtag, trattrs, trbody) in trs:
            tdtag, tdattrs, tdbody = trbody[0]
            ntd = mkel(tdtag, add_class(tdattrs, 'headcol'), tdbody)
            ntrs.append(mkel(trtag, trattrs, [ntd] + trbody[1:]))
        trs = ntrs
    return [mkel('colgroup', {}, cols)] + trs
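# The sketch below is illustrative and not part of the original module: it
# shows the (tag, attrs, body) triple format that parse_table_body consumes
# (as built by mkel), using hypothetical sample data.
def _example_parse_table_body():
    # A two-column table whose first row is bold, so it should be promoted
    # to a header row of 'th' cells.
    example_body = [
        ('col', {}, []),
        ('col', {}, []),
        ('tr', {}, [('td', {}, [('p', {}, [('b', {}, ['Name'])])]),
                    ('td', {}, [('p', {}, [('b', {}, ['Value'])])])]),
        ('tr', {}, [('td', {}, [('p', {}, ['x'])]),
                    ('td', {}, [('p', {}, ['1'])])]),
    ]
    # Expected shape of the result: a single 'colgroup' element followed by
    # the rows, with the first row rewritten to use 'th' cells.
    return parse_table_body(example_body)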
def _propagate_alignment(content, cols):
    trs = [el for el in content if el[0] == 'tr']
    for _, _, tds in trs:
        assert len(tds) == len(cols), \
            "Table row has not enough cells: %s" % tds
        for cid, col in enumerate(cols):
            if 'class' in col[1]:
                attrs = tds[cid][1]
                # FIXME ugly hack
                attrs.update(add_class(attrs, *col[1]['class']))
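# Illustrative sketch (hypothetical data, not part of the original module):
# _propagate_alignment copies alignment classes from the 'col' elements onto
# the cells of each row, mutating the rows in place -- which is why
# handle_fragment deep-copies the table content before calling it.
def _example_propagate_alignment():
    cols = [('col', {'class': ['center']}, []), ('col', {}, [])]
    content = [('colgroup', {}, cols),
               ('tr', {}, [('td', {}, ['a']), ('td', {}, ['b'])])]
    _propagate_alignment(content, cols)
    # The first cell of the row now carries the 'center' class (roughly
    # ('td', {'class': ['center']}, ['a'])); the second cell is unchanged.
    return content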
def handle_p(self, e, current_part, in_list=False):
    attrs = {}
    pPr = first_of_tag(e, P_PROPS_TAG)
    jc_class = self.JC_TO_CLASS.get(val(pPr, ns.w('jc')))
    if jc_class:
        attrs = add_class(attrs, jc_class)
    tag = style_to_tag(val(pPr, ns.w('pStyle')) or '')
    content = iter(e) if pPr is None else pPr.itersiblings()
    handle_p = partial(self.handle_p_content, current_part=current_part)
    ans = mkel(tag, attrs, flatmap(handle_p, content))
    left_indent = val(pPr, ns.w('ind'), ns.w('left')) or 0.0
    indent = int(round(float(left_indent) / self.default_indent_twips))
    if (not in_list) and indent:
        ans = lift_code(ans)
        ans = mkel('.block', {'indent': indent}, [ans])
        ans = hacky_flatten_block(ans)
    return ans
def handle_fragment(fragment, indent, transclusions, h_shift, epub_clean,
                    bibliography):
    # pylint: disable=R0911,R0914,R0912,R0913,R0915
    # FIXME(alexander): clean this up a bit, and get rid of pylint muffles
    if isinstance(fragment, basestring):
        return cgi.escape(fragment)
    (tag, attrs, content) = fragment
    if tag in ['script', 'style'] and content:
        content_str, = content
        return NOT_INLINE_TEMPLATE % dict(
            indent=indent,
            tag=tag,
            attrs_str=encode_attrs(attrs, transclusions, epub_clean),
            content_str=_indent(
                '\n' + maybe_cdatafy(_indent(content_str.strip('\n'), ' ')),
                indent))
    if tag == 'pre':
        return '\n' + highlight.as_html(fragment)

    # special case figures and tables
    if tag == 'figure':
        style = attrs['style'].copy()
        width = style.pop('width', '100%')
        attrs = dict(attrs.items(), style=style)
        # FIXME(alexander): dirty hacks to fixup caption & width
        img = content[-1]
        assert img[0] == 'img'
        img[1].setdefault('style', OrderedDict())['width'] = width
        # put figcaption towards end
        if content[0][0] == 'figcaption':
            content[0], content[-1] = content[-1], content[0]
        if style['display'] == 'inline':
            ATTRS = Var('ATTRS')  # pylint: disable=C0103
            assert content[:1] == [('img', ATTRS, [])], \
                "figure does not begin with an img"
            attrs = add_class(ATTRS.val, 'margin')
            # peel off the figure tag for inlined stuff
            # as a hack to make epub/html validate
            # (figures can't occur in all contexts imgs can)
            return handle_fragments([('img', attrs, [])],
                                    bibliography=bibliography,
                                    indent=indent,
                                    transclusions=transclusions,
                                    h_shift=h_shift,
                                    epub_clean=epub_clean)
    elif tag == 'table':
        colgroups = [el for el in content if el[0] == 'colgroup']
        COLS = Var("COLS")  # pylint: disable=C0103
        assert colgroups == [('colgroup', {}, COLS)], \
            "Expected single colgroup in table %s" % content
        # FIXME(alexander): this deepcopy is a lazy hack so we can mutate away
        # imperatively propagate table cell alignment down
        # this is a pretty horrible hack and would blow
        # up nastily if there is attribute aliasing,
        # but deepcopying should kinda make it work
        content = copy.deepcopy(content)
        _propagate_alignment(content, COLS.val)
    elif tag == 'col':
        if not epub_clean:
            attrs = attrs.copy()
            attrs['width'] = attrs['style']['width']
            del attrs['style']
        # cull
        ## return handle_fragments(content, indent)
    # FIXME(alexander): might make more sense to filter (or h-ify) these out
    # elsewhere, but for now this seems not unreasonable
    elif tag == 'title':
        tag = 'h1'
        attrs = add_class(attrs, 'title')
    elif tag == 'subtitle':
        tag = 'h2'
        attrs = add_class(attrs, 'subtitle')
    elif tag in ('CMD', 'LIT'):
        bad_command = None
        cmd_type, = attrs['class']
        # FIXME(alexander): convert tex to html for non-math;
        # convert tex math to MML for epub
        if cmd_type in ('$', 'tex'):
            tex, = content
            if cmd_type == '$':
                tex = r'\(%s\)' % tex
            return '<span class="tex2jax_process">%s</span>' % cgi.escape(tex)
        elif CITE_REX.match(cmd_type):
            if bibliography:
                bibliography.cited.add(content[0])
                # post = ('[%s]' % content[1] if len(content) > 1 and content[1]
                #         else '')
                # Post is ignored for the moment
                return _format_citation(cmd_type, content[0], bibliography)
            else:
                docerror.docproblem(
                    'Citation exists, but bibliography is missing')
        else:
            bad_command = cmd_type + (':' if content else '')
            docerror.docproblem('Unknown command type:%s' % cmd_type)
    elif epub_clean:
        if tag == 'a' and 'name' in attrs:
            assert len(attrs) == 1
            attrs = {'id': attrs['name']}
        elif tag == 'img':
            attrs = {k: attrs[k] for k in attrs
                     if k not in ('width', 'height')}

    # FIXME(alexander): support continued-list properly in html, by keeping
    # track of numbers of items per list-id and translating it to start
    if tag in H_TAGS:
        if h_shift:
            tag = 'h%d' % min(len(H_TAGS), max(1, int(tag[1]) + h_shift))
    # generic [tagname].class tags
    if '.' in tag:
        if tag == '.pagebreak':
            tag = 'div.pagebreak'  # for whitespace sanitization
        tagname, classname = tag.split('.', 1)
        tag = tagname or 'span'
        attrs = add_class(attrs, classname)
    if tag == 'CMD' and bad_command:
        tag = 'span'
        attrs = {'class': ['bad-command']}
        content = [('u', {}, [bad_command])] + content
    elif tag == 'ERR':
        tag = 'span'
        attrs = {'class': ['err'], 'title': attrs['info'][0]}

    content_str = handle_fragments(content,
                                   indent=' ' + indent,
                                   transclusions=transclusions,
                                   h_shift=h_shift,
                                   epub_clean=epub_clean,
                                   bibliography=bibliography)

    if tag in VOID_TAGS:
        assert not content
        template = "<%(tag)s%(attrs_str)s/>"
    elif tag in INLINE:
        template = "<%(tag)s%(attrs_str)s>%(content_str)s</%(tag)s>"
    elif '\n' in content_str:
        template = NOT_INLINE_TEMPLATE
    else:
        template = COMPACT_NOT_INLINE_TEMPLATE
    # FIXME(alexander): disgusting hack; fix this properly and
    # use a set representation to start with!
    classes = attrs.get('class')
    if classes:
        attrs = attrs.copy()
        attrs['class'] = sorted(set(classes))
    return template % dict(indent=indent,
                           tag=tag,
                           attrs_str=encode_attrs(attrs, transclusions,
                                                  epub_clean),
                           content_str=content_str)
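# Hedged usage sketch (not part of the original module): handle_fragment
# renders one (tag, attrs, body) triple to an HTML string.  A real caller
# would pass the document's transclusions and bibliography objects; the None
# placeholders below are assumptions for illustration only.
def _example_handle_fragment():
    fragment = ('p', {}, ['Hello, ', ('b', {}, ['world']), '!'])
    return handle_fragment(fragment,
                           indent='',
                           transclusions=None,
                           h_shift=0,
                           epub_clean=False,
                           bibliography=None)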
def parse_body(xml, context, normalize_transclusion):
    # pylint: disable=R0912,R0915,R0914
    for e in xml:
        text = (e.text or '')
        tail = (e.tail or '')
        # some style properties should be promoted to tags, e.g. underlining
        # and bolding
        tags_from_style = []
        stys_dealt_with = []
        if e.tag in (S_TAG, TAB_TAG):
            yield ' \t'[e.tag == TAB_TAG] * int(e.attrib.get(
                ns.text('c'), '1'))
            if tail:
                yield tail
            continue
        if e.tag == LINEBREAK_TAG:
            yield mkel('br', {}, [])
            continue

        sty = context.stys.get(
            e.get(STYLE_NAME_ATTR) or e.get(TABLE_STYLE_NAME_ATTR))
        # handle page breaks
        if sty and sty.par_break:
            assert e.tag in (H_TAG, P_TAG), \
                "Unexpected page-break in %r" % e.tag
            yield mkel('.pagebreak', {}, [])
            stys_dealt_with.append('par_break')

        # Handle lists specially
        if e.tag == LIST_TAG:
            new_context = context.bump_list_level(sty)
            stys_dealt_with.append('sub_list_styles')
        else:
            new_context = context

        body = list(parse_body(e, new_context, normalize_transclusion))
        assert type(body) is list and not body or type(body[0]) is not list
        attrs = {}
        if text:
            body = [text] + body

        if sty and sty.type.endswith('title'):
            head = sty.type
            body = [plaintextify(body)]
            sty = None
        elif e.tag == H_TAG:
            # skip empty headings; NB: this *must* happen
            # after we extracted eventual page-breaks, which are the only
            # useful information empty headings can contain
            if blank(body):
                continue
            head = sty.type
            # FIXME(alexander): keep track of the headings breadcrumbs in
            # context for two reasons
            #
            # 1. to associate errors with specific headings
            # 2. to warn about bad structure e.g. h1 followed by h4,
            #    rather than h2
        elif e.tag == LIST_TAG:
            head = new_context.list_type
            assert head in ('ol', 'ul')
            list_start = new_context.list_start
            if list_start is not None:
                assert head == 'ol'
                attrs['start'] = str(list_start)
            id_ = e.attrib.get(ns.xml('id'))  # pylint: disable=E1101
            if id_ is not None:
                attrs['id'] = id_
            continues = e.attrib.get(ns.text('continue-list'))
            if continues is not None:
                # make this a data attrib, so we can stuff it
                # into the html, which doesn't have direct support
                attrs['data-continue-list'] = continues
        elif e.tag == LIST_ITEM_TAG:
            head = 'li'
        elif e.tag == ANNOTATION_TAG:
            head = 'aside'
        elif e.tag in (CREATOR_TAG, NOTE_CITATION_TAG, BOOKMARK_END_TAG):
            # FIXME: extract content
            if text:
                log.warning('Hey, someone actually specified a %s: %s',
                            e.tag, text)
            if tail:
                yield tail
            continue
        elif e.tag == NOTE_TAG:
            # other valid option is 'endnote'
            assert e.attrib[ns.text('note-class')] == 'footnote'
            # skip ahead and exit early; we only represent the note-body
            assert len(e) == 2 and e[1].tag == NOTE_BODY_TAG
            assert len(body) == 1
            yield body[0]
            if tail:
                yield tail
            continue
        elif e.tag == NOTE_BODY_TAG:
            head = '.footnote'
            # FIXME(alexander): sucky hack to strip the bogus whitespace
            # google docs enters at the beginning of a footnote for some
            # reason. I should really write a more generic whitespace
            # stripping mechanism in the postprocess module that can recognize
            # consecutive whitespace even if separated-by/wrapped-in inline
            # tags.
            _, B1, B2, = map(Var, '_, B1, B2'.split(', '))
            SPACED_STR = Var(
                'SPACED_STR',
                lambda s: (isinstance(s, basestring) and
                           re.match(r'\s+', s)))
            if body == Seq[('p', _, Seq[SPACED_STR, B2:]), B1:]:
                body[0][2][0] = SPACED_STR.val.lstrip()
        # FIXME(alexander): add anchors for all paras
        elif e.tag == P_TAG:
            margin = sty.margin_left or sty.text_indent if sty else None
            indent_level = in_indents(margin) if margin else 0
            if indent_level:
                head = '.block'
                attrs['indent'] = indent_level
            else:
                head = 'p'
        # FIXME styled links etc. gdocs might not use that...
        # ... but we should be able to handle non-span bolding etc.
        elif e.tag == SPAN_TAG:
            # XXX: order can matter; we need
            #    <b><u>command</u><b>
            # not
            #    <u><b>command</b><u>
            #
            # but more generally the minimal coalescing of abutting partially
            # overlapping styles is something that needs to be thought about
            # properly at some point.
            for attr, on_values, html_tags in [
                    ('underline', [True], ['u']),
                    ('font_weight', ['bold'], ['b']),
                    ('font_style', ['italic'], ['i']),
                    ('line_through', [True], ['s']),
                    ('text_position', ['sub', 'super'], ['sub', 'sup'])]:
                value = getattr(sty, attr, None)
                if value:
                    if value not in on_values:
                        log.error("Bad value for %s: %s in %s",
                                  attr, value, e.tag)
                        continue
                    tags_from_style.append(html_tags[on_values.index(value)])
                    stys_dealt_with.append(attr)
            if is_code_font(sty.font_family):
                tags_from_style.append('code')
                stys_dealt_with.append('font_family')
            head = 'span'
        elif e.tag == A_TAG:
            assert e.attrib[ns.xlink('type')] == 'simple'
            head = 'a'
            attrs = dict(href=e.attrib[HREF_ATTR])
            # FIXME the in 'span' check is a bit too general, should use
            # something else to markup textcolor
            body = tidy(whack(lambda x: x in ('span', 'u'), body))
        elif e.tag == BOOKMARK_START_TAG:
            head = 'a'
            attrs = dict(name=e.attrib[TEXT_NAME_ATTR])
            assert (blank(text) and blank(tail) and
                    next(e.itersiblings()).tag == BOOKMARK_END_TAG)
        elif e.tag == TABLE_TAG:
            head = 'table'
            body = parse_table_body(body)
        elif e.tag == TABLE_ROW_TAG:
            head = 'tr'
        elif e.tag == TABLE_CELL_TAG:
            head = 'td'
        # FIXME repetition via table:number-columns-repeated
        # FIXME handle column-groups
        elif e.tag == TABLE_COLUMN_TAG:
            head = 'col'
            sty = context.stys.get(e.attrib.get(ns.table('style-name')))
            if sty and sty.width is not None:
                # XXX this isn't really the column width
                # since google moronically saves this even
                # if set column width is turned off thank you google!
                attrs = dict(style=OrderedDict(width=sty.width))
                stys_dealt_with.append('width')
        elif e.tag == FRAME_TAG:
            # XXX: try to find caption
            # FIXME(alexander): keep figures/tables with captions in context,
            # so that we can produce a lot/loi; add an id for all of them
            inline = e.attrib[ns.text('anchor-type')] == 'as-char'
            width = (e.attrib.get(ns.svg('width'))  # pylint: disable=E1101
                     or e.attrib[ns.style('rel-width')])
            # FIXME(alexander): should handle all these, in theory:
            # <http://www.w3.org/TR/SVG11/struct.html#SVGElementWidthAttribute>
            # ("em" | "ex" | "px" | "in" | "cm" | "mm" | "pt" | "pc" )
            assert width.endswith('cm'), \
                'Expected figure width in cm, got %s' % width
            relwidth = float(width[:-2]) / context.stys.textwidth
            head, attrs, body = make_figure(
                relwidth=relwidth,
                inline=inline,
                # FIXME(alexander): the body[0][1] to access the image
                # will blow up on leading whitespace in the body
                body=list(x for x in body
                          if not (isinstance(x, basestring) and blank(x))),
                src=body[0][1]['src'],
                original_href=e.find(ns.draw('image')).get(ns.xlink('href')))
        elif e.tag == IMAGE_TAG:
            head = 'img'
            attrs = dict(src=normalize_transclusion(e.attrib[HREF_ATTR]))
        else:
            log.warning('Ignoring tag %s', e.tag)
            continue
            # FIXME raise RuntimeError('Unexpected tag: %s' % e.tag)

        sty_tagged = reduce(lambda parsed, tag: [mkel(tag, {}, parsed)],
                            tags_from_style, tidy(body))
        if sty:
            if sty.text_align:
                stys_dealt_with.append('text_align')
                attrs = add_class(attrs, sty.text_align)
            if sty.background_color:
                stys_dealt_with.append('background_color')
                iadd_style(attrs, 'background-color', sty.background_color)
            if sty.color:
                stys_dealt_with.append('color')
                iadd_style(attrs, 'color', sty.color)
        if e.tag == LIST_TAG:
            if new_context.list_style_type:
                attrs = add_class(attrs, new_context.list_style_type)
        # FIXME additional tidy
        parsed = mkel(head, attrs, sty_tagged)
        if head == 'span' and 'style' in attrs:
            B = Var('B')
            if parsed == ('span', attrs, [('code', {}, B)]):
                parsed = mkel('code', {}, [('span', attrs, B.val)])

        leftover_styles = sty and set(
            sty.active_props()) - set(stys_dealt_with)
        if leftover_styles:
            log.warn('Ignoring style elements: %r in %r "%s"',
                     ([(k, getattr(sty, k)) for k in leftover_styles]),
                     head, plaintextify(body))
        preprocess.maybe_anchorize_id(head, attrs, sty_tagged)
        yield parsed
        if tail:
            yield tail
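# Hedged usage note: parse_body is a generator over an ODF XML element tree,
# so callers typically materialise it, e.g. (names below are illustrative):
#
#   body = list(parse_body(office_text_element, context,
#                          normalize_transclusion))
#
# It yields a mix of plain strings and (tag, attrs, body) triples that the
# HTML writer (handle_fragment above) can then serialise.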