def unparse_literal(lit, roundtrip=True, plain=False): # pylint: disable=R0911
    """Return a string representation of `lit`.

    - `roundtrip` affects how literals with context-dependent values are
      handled, e.g. when ``roundtrip=False`` then ``Date('today') ->
      "2014-01-01"`` (instead of ``"today"``).

    - `plain` controls if rich text content is converted to plaintext
      (e.g. for pdf or epub metadata)
    """
    # FIXME(alexander): try to break cyclic imports
    import converter.html_writer
    if lit is None:
        return ''  # XXX(alexander)
    if isinstance(lit, basestring):
        return lit
    if isinstance(lit, bool):
        # NB: bool is checked before the to_value/to_string protocols so
        # booleans always serialize as 'yes'/'no'.
        return ('no', 'yes')[lit]
    if not roundtrip and hasattr(lit, 'to_value'):
        # concretize context-dependent literals (e.g. Date('today'))
        return lit.to_value()
    if hasattr(lit, 'to_string'):
        return lit.to_string()
    if isinstance(lit, list):  # Rich-text
        if plain:
            return plaintextify(lit)
        return converter.html_writer.write_body(lit)
    assert False, "Unknown literal type %r" % (lit,)
def unparse_literal(lit, roundtrip=True, plain=False): # pylint: disable=R0911
    """Serialize the metadata literal `lit` to a string.

    - With ``roundtrip=False``, literals whose value depends on context
      are concretized via their ``to_value`` method, e.g.
      ``Date('today') -> "2014-01-01"`` rather than ``"today"``.

    - With ``plain=True``, rich-text content is flattened to plaintext
      (useful for pdf or epub metadata).
    """
    # FIXME(alexander): try to break cyclic imports
    import converter.html_writer

    if lit is None:
        return ''  # XXX(alexander)
    if isinstance(lit, basestring):
        return lit
    if isinstance(lit, bool):
        return 'yes' if lit else 'no'
    if not roundtrip and hasattr(lit, 'to_value'):
        return lit.to_value()
    if hasattr(lit, 'to_string'):
        return lit.to_string()
    if isinstance(lit, list):  # Rich-text
        return (plaintextify(lit) if plain
                else converter.html_writer.write_body(lit))
    assert False, "Unknown literal type %r" % (lit, )
def tocify_heading(e, gensym):
    """Turn a heading element into `('h*', {'id': ID}, [STRING])`.

    Assumes the `h*` element already carries an id, or is followed by an
    anchor that `lift_anchor_id` can promote to one.
    """
    # pylint: disable=C0103
    tag, attrs, body = e
    assert tag in H_TAGS
    attrs, body = lift_anchor_id(attrs, body, gensym, kill_anchor=True)
    return (tag, {'id': attrs['id']}, [plaintextify(body)])
def check_supplied(): # pylint: disable=R0912
    """Validate user-supplied meta fields against the document's schema.

    Closure: reads and mutates names from the enclosing scope --
    `meta` (supplied values), `canonical_meta` (output: canonicalized
    values), `parsed` (output: reified values), `self._info` (the
    per-field schema), and the `error` reporter.
    NOTE(review): reconstructed from a whitespace-mangled source; the
    nesting of the type-mismatch branch below is a best guess -- confirm
    against the original file.
    """
    def try_to_reify(v, parse):
        # Parse `v` with `parse`, reporting (not raising) conversion
        # failures; returns None implicitly on failure.
        try:
            return parse(v)
        except (KeyboardInterrupt, SystemExit):
            raise
        # `k` below is the loop variable of the enclosing for-loop,
        # hence the undefined-loop-variable suppression.
        except Exception as ex: # pylint: disable=W0631
            log.info('Meta conversion error on %s, %s', k, ex)
            error('Not a valid %s format (expected %s)' % (
                right_type, TYPE_EXAMPLES[right_type]),
                k, supplied=meta[k])

    for k in meta:
        canonical_meta[k] = meta[k] # default
        if k not in self._info:
            # Unknown field: offer a spelling suggestion where possible.
            maybe_meants = spellsuggest.spell_suggest(
                k, self._info.keys())
            suggestion = (" (did you mean '%s'?)" % maybe_meants[0]
                          if maybe_meants else '')
            if k not in ('title', 'subtitle'):
                error("Unexpected field '%s'%s" % (k, suggestion),
                      k, meta[k])
            else:
                error("This document type does not have a %s" % k,
                      k, meta[k])
            continue
        # Types the supplied python value could represent vs. the type
        # the schema requires for this field.
        potential_types = PY_TYPE_TO_TYPESETR_TYPES[type(meta[k])]
        right_type = self._info[k]['type']
        if right_type in potential_types:
            if right_type == 'bibliography':
                parsed[k] = try_to_reify(meta[k], Bibliography)
        else:
            # Wrong python type: rich text may still be acceptable once
            # flattened to plain text.
            if 'rich-text' in potential_types:
                if not isinstance(meta[k], basestring):
                    meta[k] = postprocess.plaintextify(meta[k])
                potential_types = ('text',)
            if potential_types == ('text',):
                # A text value may parse into the required literal type.
                parsed[k] = try_to_reify(
                    meta[k],
                    # pylint: disable=W0640
                    lambda v: parse_literal(v, right_type))
            else:
                error("Expected meta field '%s:' to be"
                      " of type '%s', not '%s'" % (
                          k, right_type, potential_types[0]),
                      k, supplied=meta[k])
def latexify(self, ast): # pylint: disable=E0102,R0914,R0915,R0911,R0912
    """Recursively render an AST node (or list of nodes) as LaTeX.

    `ast` is either a list of nodes, a plain string (LaTeX-quoted), or a
    `(head, attrs, body)` triple dispatched on the HTML-ish tag `head`
    (plus pseudo-elements such as '.footnote', '.pagebreak', 'CMD',
    'LIT').  Unknown tags are logged as errors and rendered as ''.
    NOTE(review): reconstructed from a whitespace-mangled source --
    confirm statement nesting against the original file.
    """
    if isinstance(ast, list):
        return re.sub('\n\n$', '\n', join(*map(self.latexify, ast)))
    else:
        node = ast
        if isinstance(node, basestring):
            return quote(node)
        else:
            assert isinstance(node, tuple)
            h, a, b = node
            if h == 'div':
                # canonicalize pseudo-elements
                h = a['class'].pop()
                assert not a['class']
                del a['class']
            if h[:-1] == 'h':
                if self.am_inside('list') or self.am_inside('table'):
                    return docwarn(
                        self.latexify(b),
                        'Cannot have sections inside lists or tables: %r' %
                        postprocess.plaintextify(b))
                else:
                    with self.inside('section'):
                        if a:
                            log.warn('heading w/ attr %r', a)
                        labels, b = extract_labels(b)
                        return self.section(h, b, labels)
            elif h == 'p':
                ans = nl(self.latexify(b))
                if self.am_inside('.footnote') and self.am_inside('table'):
                    return docwarn(ans,
                                   'Multi-paragraph footnotes in tables are'
                                   ' unsupported')
                # NOTE(review): nl() is applied twice on this path (once
                # above, once here) -- presumably intentional paragraph
                # separation; confirm.
                return nl(ans)
            elif h == 'span':
                return self.latexify(b)  # XXX
            elif h in ('ol', 'ul'):
                ol = partial(self.enumerate_,
                             start=a.get('start'),
                             series=a.get('id'),
                             resume=a.get('data-continue-list'))
                with self.inside('list'):
                    return nl(freshline(
                        {'ol': ol,
                         'ul': itemize}[h](self.latexify(b))))
            elif h == 'li':
                labels, b = extract_labels(b)
                labelling = (join(*(map(mklabel, labels) + [' ']))
                             if labels else '')
                return join(freshline(cmd('item')), labelling,
                            self.latexify(b))
            elif h == 'table':
                nested_table = self.am_inside('table')
                with self.inside('table'):
                    # pylint: disable=C0103
                    CLASS_TO_SPEC = {'left': 'P', 'center': 'C',
                                     'right': 'R', 'justify': 'N'}
                    b = b[:]
                    tablecaption = None
                    if b[0][0] == 'caption':
                        with self.inside('caption'):
                            tablecaption = self.latexify(b[0][2])
                        del b[0]
                    colgroup = [el for el in b if el[0] == 'colgroup']
                    rows = [el for el in b if el[0] == 'tr']
                    assert len(colgroup) == 1, \
                        "Expected single colgroup in table %s" % b
                    cols = colgroup[0][2]
                    # Build the per-column LaTeX spec from the col
                    # elements' alignment class and width style.
                    colspecs = []
                    for col_h, col_a, col_b in cols:
                        if col_h != 'col':
                            break
                        assert not col_b
                        coltype = 'P'
                        for cls in CLASS_TO_SPEC:
                            if cls in col_a.get('class', []):
                                coltype = CLASS_TO_SPEC[cls]
                        coltype = "%s{%s}" % (coltype, textwidth_percent(
                            col_a['style']['width']))
                        colspecs.append(coltype)
                    rows = "\\tabularnewline\n".join(
                        map(self.latexify, rows))
                    if nested_table and tablecaption:
                        docproblem(
                            "Tables within tables can't have captions;"
                            " outputing caption as normal text",
                            level='warning')
                        ans = join(nl(table(colspecs, rows)), tablecaption)
                    else:
                        ans = table(colspecs, rows, tablecaption)
                # Flush deferred footnotetext only once we are outside
                # any table (am_inside('table') is False again here).
                if self.post_float_yuck and not self.am_inside('table'):
                    ans = join(ans, *self.post_float_yuck)
                    del self.post_float_yuck[:]
                return ans
            elif h == 'col':
                # FIXME
                assert False, "Unexpected col"
            elif h == 'tr':
                return " & ".join(map(self.latexify, b))
            elif h == 'td':
                if 'headcol' in a.get('class', []):
                    return colh(self.latexify(b))
                return self.latexify(b)
            elif h == 'th':
                if 'headcol' in a.get('class', []):
                    return rowh(colh(self.latexify(b)))
                return rowh(self.latexify(b))
            elif h == 'figure':
                b = b[:]
                if b[0][0] == 'figcaption':
                    with self.inside('caption'):
                        figcaption = self.latexify(b[0][2])
                    del b[0]
                else:
                    figcaption = None
                assert len(b) == 1 and b[0][0] == 'img'
                img = b[0][1]['src']
                inline = False
                warns = []
                if a['style']['display'] == 'inline':
                    if self.am_inside('table'):
                        warns.append([
                            'Margin figures not supported in tables, '
                            'inserting into table cell'])
                    else:
                        inline = True
                if inline:
                    if figcaption:
                        warns.append(
                            ['Ignoring figcaption for inline figure:'
                             ' "%s"', figcaption])
                    ans = marginfigure(img=img)
                else:
                    fakecaption = figcaption and self.am_inside('table')
                    if fakecaption:
                        warns.append([
                            "Figures in tables can't have captions; "
                            "outputing caption as normal text"])
                    # inside blockquotes more complicated figure
                    # environments don't seem to work reliably
                    rawincludegraphics = self.am_inside('blockquote')
                    ans = figure(img=img,
                                 classes=a.get('class', []),
                                 width=a['style']['width'],
                                 figcaption=figcaption,
                                 fakecaption=fakecaption,
                                 rawincludegraphics=rawincludegraphics)
                if self.post_float_yuck and not self.am_inside('table'):
                    ans = join(ans, *self.post_float_yuck)
                    del self.post_float_yuck[:]
                return ans if not warns else docwarns(ans, *warns)
            elif h == 'img':
                assert False, 'unexpected image'
            elif h == 'a':
                if 'name' in a:
                    # we can't do that blindly, because we want to
                    # generate labels for things like lists and headings
                    # this is only a fallback for anchors outside of
                    # 'labelled' envs
                    return cmd('hypertarget', [],
                               [a['name'].lstrip('#'), ''])
                elif 'href' in a:
                    if a['href'].startswith('#'):
                        return cmd('hyperref',
                                   [latexify_href(a['href'][1:])],
                                   [self.latexify(b)])
                    ##
                    # XXX(alexander): handle bare urls specially, because
                    # we want more relaxed linebreaking rules for them.
                    # Note that we're not using \url directly, because
                    # it's not robust and also can't cope with certain
                    # arguments, such as unbalanced '{'/'}'s. Also, even
                    # with fairly aggressive hyphenization params, this is
                    # in itself not enough to resolve all overfull hbox
                    # issues with urls, although it's not 100% clear to me
                    # why.
                    elif b and a['href'] in (b[0], url_fix(b[0])):
                        # XXX(alexander): use url_fixed version here?
                        return urldef(a['href'], self.urldefs)
                    else:
                        ans = cmd('href', [],
                                  [latexify_href(a['href']),
                                   self.latexify(b)])
                        if b[0].startswith('http'):
                            ans = docwarn(
                                ans,
                                'Suspicious link with body/href'
                                ' mismatch: %r != %r' % (
                                    a['href'].encode('utf-8'), b[0]))
                        return ans
                else:
                    assert False, 'Malformed link: %s' % ((h, a, b),)
            elif h == 'aside':
                return cmd('comment', [], [self.latexify(b)])
            elif h in ('b', 'i', 'u', 's'):
                assert not a, 'unexpected <%s %r' % (h, a)
                return self.handle_emphasis(h, b)
            elif h == 'code':
                #FIXME: write something more specialized
                return cmd('texttt', [], [self.latexify(b)])
            elif h == 'sup':
                return cmd('textsuperscript', [], [self.latexify(b)])
            elif h == 'sub':
                return cmd('textsubscript', [], [self.latexify(b)])
            elif h == '.footnote':
                with self.inside('.footnote'):
                    if self.am_inside('caption'):
                        # LaTeX can't take \footnote inside floats:
                        # defer the text and emit only the mark here.
                        self.post_float_yuck.append(
                            cmd('footnotetext', [], [self.latexify(b)]))
                        return cmd(r'protect\footnotemark', [], [])
                    else:
                        return cmd('footnote', [], [self.latexify(b)])
            elif h == '.pagebreak':
                return nl(cmd('clearpage', [], [self.latexify(b)]))
            elif h == 'br':
                assert a == {}
                assert b == []
                return nl(cmd('newline'))
            elif h == 'blockquote':
                with self.inside('blockquote'):
                    return blockquote(self.latexify(b))
            elif (h == 'footer'
                  and b == [Seq['cite', :]]
                  and self.am_inside('blockquote')):
                return nl(cmd('attrib', [], [self.latexify(b[0][2])]))
            elif node == ('CMD', {'class': ['$']}, b):
                return join('$', b[0], '$')
            elif node == ('CMD',
                          {'class': [Var('CITE', CITE_REX.match)]}, b):
                return self.munge_cite(node, b)
            elif node == ('CMD', {'class': ['tex']}, b):
                # raw TeX passthrough
                return b[0]
            elif h in ('CMD', 'LIT'):
                return self.bad_command(*node)
            elif h == 'pre':
                return highlight.as_latex(node)
            elif h == 'wbr':
                return '{}'
            else:
                #FIXME(alexander): set 1 as error-code?
                log.error('Unexpected tag: %s %r %r', h, a, b)
                return join("")
def parse_body(xml, context, normalize_transclusion):
    # pylint: disable=R0912,R0915,R0914
    """Yield parsed AST fragments for the children of ODF element `xml`.

    Generator: walks the child elements `e` of `xml` (element API with
    `.tag`/`.text`/`.tail`/`.attrib`; `itersiblings` suggests lxml --
    confirm), translating each ODF tag into a `(head, attrs, body)`
    triple via `mkel` and recursing for nested content.  `context`
    supplies style lookups (`context.stys`) and list-nesting state;
    `normalize_transclusion` maps image hrefs to local sources.  Tail
    text is yielded after each element.
    NOTE(review): reconstructed from a whitespace-mangled source --
    confirm statement nesting against the original file.
    """
    for e in xml:
        text = (e.text or '')
        tail = (e.tail or '')
        # some style properties should be promoted to tags, e.g. underlining
        # and bolding
        tags_from_style = []
        stys_dealt_with = []
        if e.tag in (S_TAG, TAB_TAG):
            # space/tab runs: repeat ' ' or '\t' per the text:c count
            yield ' \t'[e.tag == TAB_TAG] * int(e.attrib.get(
                ns.text('c'), '1'))
            if tail:
                yield tail
            continue
        if e.tag == LINEBREAK_TAG:
            yield mkel('br', {}, [])
            continue

        sty = context.stys.get(
            e.get(STYLE_NAME_ATTR) or e.get(TABLE_STYLE_NAME_ATTR))
        # handle page breaks
        if sty and sty.par_break:
            assert e.tag in (H_TAG, P_TAG), \
                "Unexpected page-break in %r" % e.tag
            yield mkel('.pagebreak', {}, [])
            stys_dealt_with.append('par_break')
        # Handle lists specially
        if e.tag == LIST_TAG:
            new_context = context.bump_list_level(sty)
            stys_dealt_with.append('sub_list_styles')
        else:
            new_context = context

        body = list(parse_body(e, new_context, normalize_transclusion))
        # i.e. (type(body) is list and not body) or body[0] is not a list
        assert type(body) is list and not body or type(body[0]) is not list
        attrs = {}
        if text:
            body = [text] + body
        if sty and sty.type.endswith('title'):
            head = sty.type
            body = [plaintextify(body)]
            sty = None
        elif e.tag == H_TAG:
            # skip empty headings; NB: this *must* happen
            # after we extracted eventual page-breaks, which are the only
            # useful information empty headings can contain
            if blank(body):
                continue
            head = sty.type
            # FIXME(alexander): keep track of the headings breadcrumbs in
            # context for two reasons
            #
            # 1. to associate errors with specific headings
            # 2. to warn about bad structure e.g. h1 followed by h4,
            #    rather than h2
        elif e.tag == LIST_TAG:
            head = new_context.list_type
            assert head in ('ol', 'ul')
            list_start = new_context.list_start
            if list_start is not None:
                assert head == 'ol'
                attrs['start'] = str(list_start)
            id_ = e.attrib.get(ns.xml('id')) # pylint: disable=E1101
            if id_ is not None:
                attrs['id'] = id_
            continues = e.attrib.get(ns.text('continue-list'))
            if continues is not None:
                # make this a data attrib, so we can stuff it
                # into the html, which doesn't have direct support
                attrs['data-continue-list'] = continues
        elif e.tag == LIST_ITEM_TAG:
            head = 'li'
        elif e.tag == ANNOTATION_TAG:
            head = 'aside'
        elif e.tag in (CREATOR_TAG, NOTE_CITATION_TAG, BOOKMARK_END_TAG):
            #FIXME: extract content
            if text:
                log.warning('Hey, someone actually specified a %s: %s',
                            e.tag, text)
            if tail:
                yield tail
            continue
        elif e.tag == NOTE_TAG:
            # other valid option is 'endnote'
            assert e.attrib[ns.text('note-class')] == 'footnote'
            # skip ahead and exit early; we only represent the note-body
            assert len(e) == 2 and e[1].tag == NOTE_BODY_TAG
            assert len(body) == 1
            yield body[0]
            if tail:
                yield tail
            continue
        elif e.tag == NOTE_BODY_TAG:
            head = '.footnote'
            # FIXME(alexander): sucky hack to strip the bogus whitespace
            # google docs enters at the beginning of a footnote for some
            # reason. I should really write a more generic whitespace
            # stripping mechanism in the postprocess module that can
            # recognize consecutive whitespace even if
            # seperated-by/wrapped-in inline tags.
            _, B1, B2, = map(Var, '_, B1, B2'.split(', '))
            SPACED_STR = Var(
                'SPACED_STR',
                lambda s: (isinstance(s, basestring)
                           and re.match(r'\s+', s)))
            if body == Seq[('p', _, Seq[SPACED_STR, B2:]), B1:]:
                body[0][2][0] = SPACED_STR.val.lstrip()
        # FIXME(alexander): add anchors for all paras
        elif e.tag == P_TAG:
            # parses as: (sty.margin_left or sty.text_indent) if sty ...
            margin = sty.margin_left or sty.text_indent if sty else None
            indent_level = in_indents(margin) if margin else 0
            if indent_level:
                head = '.block'
                attrs['indent'] = indent_level
            else:
                head = 'p'
        #FIXME styled links etc. gdocs might not use that...
        #... but we should be able to handle non-span bolding etc.
        elif e.tag == SPAN_TAG:
            # XXX: order can matter; we need
            #    <b><u>command</u><b>
            # not
            #    <u><b>command</b><u>
            #
            # but more generally the minimal coalescing of abutting
            # partially overlapping styles is something that needs to be
            # thought about properly at some point.
            for attr, on_values, html_tags in [
                    ('underline', [True], ['u']),
                    ('font_weight', ['bold'], ['b']),
                    ('font_style', ['italic'], ['i']),
                    ('line_through', [True], ['s']),
                    ('text_position', ['sub', 'super'], ['sub', 'sup'])]:
                value = getattr(sty, attr, None)
                if value:
                    if value not in on_values:
                        log.error("Bad value for %s: %s in %s",
                                  attr, value, e.tag)
                        continue
                    tags_from_style.append(
                        html_tags[on_values.index(value)])
                    stys_dealt_with.append(attr)
            if is_code_font(sty.font_family):
                tags_from_style.append('code')
                stys_dealt_with.append('font_family')
            head = 'span'
        elif e.tag == A_TAG:
            assert e.attrib[ns.xlink('type')] == 'simple'
            head = 'a'
            attrs = dict(href=e.attrib[HREF_ATTR])
            # FIXME the in 'span' check is a bit too general, should use
            # something else to markup textcolor
            body = tidy(whack(lambda x: x in ('span', 'u'), body))
        elif e.tag == BOOKMARK_START_TAG:
            head = 'a'
            attrs = dict(name=e.attrib[TEXT_NAME_ATTR])
            assert (blank(text) and blank(tail) and
                    next(e.itersiblings()).tag == BOOKMARK_END_TAG)
        elif e.tag == TABLE_TAG:
            head = 'table'
            body = parse_table_body(body)
        elif e.tag == TABLE_ROW_TAG:
            head = 'tr'
        elif e.tag == TABLE_CELL_TAG:
            head = 'td'
        #FIXME repetition via table:number-columns-repeated
        #FIXME handle column-groups
        elif e.tag == TABLE_COLUMN_TAG:
            head = 'col'
            sty = context.stys.get(e.attrib.get(ns.table('style-name')))
            if sty and sty.width is not None:
                # XXX this isn't really the column width
                # since google moronically saves this even
                # if set column width is turned off thank you google!
                attrs = dict(style=OrderedDict(width=sty.width))
                stys_dealt_with.append('width')
        elif e.tag == FRAME_TAG:
            # XXX: try to find caption
            # FIXME(alexander): keep figures/tables with captions in
            # context, so that we can produce a lot/loi; add an id for
            # all of them
            inline = e.attrib[ns.text('anchor-type')] == 'as-char'
            width = (
                e.attrib.get(ns.svg('width')) # pylint: disable=E1101
                or e.attrib[ns.style('rel-width')])
            # FIXME(alexander): should handle all these, in theory:
            # <http://www.w3.org/TR/SVG11/struct.html#SVGElementWidthAttribute>
            # ("em" | "ex" | "px" | "in" | "cm" | "mm" | "pt" | "pc" )
            assert width.endswith('cm'), \
                'Expected figure width in cm, got %s' % width
            relwidth = float(width[:-2]) / context.stys.textwidth
            head, attrs, body = make_figure(
                relwidth=relwidth,
                inline=inline,
                # FIXME(alexander): the body[0][1] to access the image
                # will blow up on leading whitespace in the body
                body=list(x for x in body
                          if not (isinstance(x, basestring) and blank(x))),
                src=body[0][1]['src'],
                original_href=e.find(
                    ns.draw('image')).get(ns.xlink('href')))
        elif e.tag == IMAGE_TAG:
            head = 'img'
            attrs = dict(src=normalize_transclusion(e.attrib[HREF_ATTR]))
        else:
            log.warning('Ignoring tag %s', e.tag)
            continue
            # FIXME raise RuntimeError('Unexpected tag: %s' % e.tag)

        # Wrap the body in the tags promoted from character styles
        # (innermost first), e.g. ['b', 'u'] -> ('b', {}, [('u', ...)]).
        sty_tagged = reduce(lambda parsed, tag: [mkel(tag, {}, parsed)],
                            tags_from_style, tidy(body))
        if sty:
            if sty.text_align:
                stys_dealt_with.append('text_align')
                attrs = add_class(attrs, sty.text_align)
            if sty.background_color:
                stys_dealt_with.append('background_color')
                iadd_style(attrs, 'background-color', sty.background_color)
            if sty.color:
                stys_dealt_with.append('color')
                iadd_style(attrs, 'color', sty.color)
        if e.tag == LIST_TAG:
            if new_context.list_style_type:
                attrs = add_class(attrs, new_context.list_style_type)
        # FIXME additional tidy
        parsed = mkel(head, attrs, sty_tagged)
        if head == 'span' and 'style' in attrs:
            # prefer ('code', {}, [('span', style, ...)]) nesting
            B = Var('B')
            if parsed == ('span', attrs, [('code', {}, B)]):
                parsed = mkel('code', {}, [('span', attrs, B.val)])
        leftover_styles = sty and set(
            sty.active_props()) - set(stys_dealt_with)
        if leftover_styles:
            log.warn('Ignoring style elements: %r in %r "%s"',
                     ([(k, getattr(sty, k)) for k in leftover_styles]),
                     head, plaintextify(body))
        preprocess.maybe_anchorize_id(head, attrs, sty_tagged)
        yield parsed
        if tail:
            yield tail
def latexify(self, ast): # pylint: disable=E0102,R0914,R0915,R0911,R0912
    """Render an AST node, or a list of nodes, as a LaTeX string.

    Strings are LaTeX-quoted; lists are latexified element-wise and
    joined; `(head, attrs, body)` triples are dispatched on `head`
    (HTML-ish tags plus pseudo-elements '.footnote', '.pagebreak',
    'CMD'/'LIT').  Unknown tags log an error and render as ''.
    NOTE(review): reconstructed from a whitespace-mangled source --
    confirm statement nesting against the original file.
    """
    if isinstance(ast, list):
        return re.sub('\n\n$', '\n', join(*map(self.latexify, ast)))
    else:
        node = ast
        if isinstance(node, basestring):
            return quote(node)
        else:
            assert isinstance(node, tuple)
            h, a, b = node
            if h == 'div':
                # canonicalize pseudo-elements
                h = a['class'].pop()
                assert not a['class']
                del a['class']
            if h[:-1] == 'h':
                if self.am_inside('list') or self.am_inside('table'):
                    return docwarn(
                        self.latexify(b),
                        'Cannot have sections inside lists or tables: %r' %
                        postprocess.plaintextify(b))
                else:
                    with self.inside('section'):
                        if a:
                            log.warn('heading w/ attr %r', a)
                        labels, b = extract_labels(b)
                        return self.section(h, b, labels)
            elif h == 'p':
                ans = nl(self.latexify(b))
                if self.am_inside('.footnote') and self.am_inside('table'):
                    return docwarn(
                        ans,
                        'Multi-paragraph footnotes in tables are'
                        ' unsupported')
                # NOTE(review): nl() applied twice on this path --
                # presumably deliberate paragraph separation; confirm.
                return nl(ans)
            elif h == 'span':
                return self.latexify(b)  # XXX
            elif h in ('ol', 'ul'):
                ol = partial(self.enumerate_,
                             start=a.get('start'),
                             series=a.get('id'),
                             resume=a.get('data-continue-list'))
                with self.inside('list'):
                    return nl(
                        freshline({
                            'ol': ol,
                            'ul': itemize
                        }[h](self.latexify(b))))
            elif h == 'li':
                labels, b = extract_labels(b)
                labelling = (join(*(map(mklabel, labels) + [' ']))
                             if labels else '')
                return join(freshline(cmd('item')), labelling,
                            self.latexify(b))
            elif h == 'table':
                nested_table = self.am_inside('table')
                with self.inside('table'):
                    # pylint: disable=C0103
                    CLASS_TO_SPEC = {
                        'left': 'P',
                        'center': 'C',
                        'right': 'R',
                        'justify': 'N'
                    }
                    b = b[:]
                    tablecaption = None
                    if b[0][0] == 'caption':
                        with self.inside('caption'):
                            tablecaption = self.latexify(b[0][2])
                        del b[0]
                    colgroup = [el for el in b if el[0] == 'colgroup']
                    rows = [el for el in b if el[0] == 'tr']
                    assert len(colgroup) == 1, \
                        "Expected single colgroup in table %s" % b
                    cols = colgroup[0][2]
                    # per-column LaTeX spec: alignment class + width
                    colspecs = []
                    for col_h, col_a, col_b in cols:
                        if col_h != 'col':
                            break
                        assert not col_b
                        coltype = 'P'
                        for cls in CLASS_TO_SPEC:
                            if cls in col_a.get('class', []):
                                coltype = CLASS_TO_SPEC[cls]
                        coltype = "%s{%s}" % (coltype, textwidth_percent(
                            col_a['style']['width']))
                        colspecs.append(coltype)
                    rows = "\\tabularnewline\n".join(
                        map(self.latexify, rows))
                    if nested_table and tablecaption:
                        docproblem(
                            "Tables within tables can't have captions;"
                            " outputing caption as normal text",
                            level='warning')
                        ans = join(nl(table(colspecs, rows)), tablecaption)
                    else:
                        ans = table(colspecs, rows, tablecaption)
                # flush deferred footnotetext once outside all tables
                if self.post_float_yuck and not self.am_inside('table'):
                    ans = join(ans, *self.post_float_yuck)
                    del self.post_float_yuck[:]
                return ans
            elif h == 'col':
                # FIXME
                assert False, "Unexpected col"
            elif h == 'tr':
                return " & ".join(map(self.latexify, b))
            elif h == 'td':
                if 'headcol' in a.get('class', []):
                    return colh(self.latexify(b))
                return self.latexify(b)
            elif h == 'th':
                if 'headcol' in a.get('class', []):
                    return rowh(colh(self.latexify(b)))
                return rowh(self.latexify(b))
            elif h == 'figure':
                b = b[:]
                if b[0][0] == 'figcaption':
                    with self.inside('caption'):
                        figcaption = self.latexify(b[0][2])
                    del b[0]
                else:
                    figcaption = None
                assert len(b) == 1 and b[0][0] == 'img'
                img = b[0][1]['src']
                inline = False
                warns = []
                if a['style']['display'] == 'inline':
                    if self.am_inside('table'):
                        warns.append([
                            'Margin figures not supported in tables, '
                            'inserting into table cell'
                        ])
                    else:
                        inline = True
                if inline:
                    if figcaption:
                        warns.append([
                            'Ignoring figcaption for inline figure:'
                            ' "%s"', figcaption
                        ])
                    ans = marginfigure(img=img)
                else:
                    fakecaption = figcaption and self.am_inside('table')
                    if fakecaption:
                        warns.append([
                            "Figures in tables can't have captions; "
                            "outputing caption as normal text"
                        ])
                    # inside blockquotes more complicated figure
                    # environments don't seem to work reliably
                    rawincludegraphics = self.am_inside('blockquote')
                    ans = figure(img=img,
                                 classes=a.get('class', []),
                                 width=a['style']['width'],
                                 figcaption=figcaption,
                                 fakecaption=fakecaption,
                                 rawincludegraphics=rawincludegraphics)
                if self.post_float_yuck and not self.am_inside('table'):
                    ans = join(ans, *self.post_float_yuck)
                    del self.post_float_yuck[:]
                return ans if not warns else docwarns(ans, *warns)
            elif h == 'img':
                assert False, 'unexpected image'
            elif h == 'a':
                if 'name' in a:
                    # we can't do that blindly, because we want to
                    # generate labels for things like lists and headings
                    # this is only a fallback for anchors outside of
                    # 'labelled' envs
                    return cmd('hypertarget', [],
                               [a['name'].lstrip('#'), ''])
                elif 'href' in a:
                    if a['href'].startswith('#'):
                        return cmd('hyperref',
                                   [latexify_href(a['href'][1:])],
                                   [self.latexify(b)])
                    ##
                    # XXX(alexander): handle bare urls specially, because
                    # we want more relaxed linebreaking rules for them.
                    # Note that we're not using \url directly, because
                    # it's not robust and also can't cope with certain
                    # arguments, such as unbalanced '{'/'}'s. Also, even
                    # with fairly aggressive hyphenization params, this is
                    # in itself not enough to resolve all overfull hbox
                    # issues with urls, although it's not 100% clear to me
                    # why.
                    elif b and a['href'] in (b[0], url_fix(b[0])):
                        # XXX(alexander): use url_fixed version here?
                        return urldef(a['href'], self.urldefs)
                    else:
                        ans = cmd(
                            'href', [],
                            [latexify_href(a['href']),
                             self.latexify(b)])
                        if b[0].startswith('http'):
                            ans = docwarn(
                                ans,
                                'Suspicious link with body/href'
                                ' mismatch: %r != %r' %
                                (a['href'].encode('utf-8'), b[0]))
                        return ans
                else:
                    assert False, 'Malformed link: %s' % ((h, a, b), )
            elif h == 'aside':
                return cmd('comment', [], [self.latexify(b)])
            elif h in ('b', 'i', 'u', 's'):
                assert not a, 'unexpected <%s %r' % (h, a)
                return self.handle_emphasis(h, b)
            elif h == 'code':
                #FIXME: write something more specialized
                return cmd('texttt', [], [self.latexify(b)])
            elif h == 'sup':
                return cmd('textsuperscript', [], [self.latexify(b)])
            elif h == 'sub':
                return cmd('textsubscript', [], [self.latexify(b)])
            elif h == '.footnote':
                with self.inside('.footnote'):
                    if self.am_inside('caption'):
                        # defer the footnote text (illegal inside floats)
                        # and emit only the mark here
                        self.post_float_yuck.append(
                            cmd('footnotetext', [], [self.latexify(b)]))
                        return cmd(r'protect\footnotemark', [], [])
                    else:
                        return cmd('footnote', [], [self.latexify(b)])
            elif h == '.pagebreak':
                return nl(cmd('clearpage', [], [self.latexify(b)]))
            elif h == 'br':
                assert a == {}
                assert b == []
                return nl(cmd('newline'))
            elif h == 'blockquote':
                with self.inside('blockquote'):
                    return blockquote(self.latexify(b))
            elif (h == 'footer'
                  and b == [Seq['cite', :]]
                  and self.am_inside('blockquote')):
                return nl(cmd('attrib', [], [self.latexify(b[0][2])]))
            elif node == ('CMD', {'class': ['$']}, b):
                return join('$', b[0], '$')
            elif node == ('CMD', {
                    'class': [Var('CITE', CITE_REX.match)]
            }, b):
                return self.munge_cite(node, b)
            elif node == ('CMD', {'class': ['tex']}, b):
                # raw TeX passthrough
                return b[0]
            elif h in ('CMD', 'LIT'):
                return self.bad_command(*node)
            elif h == 'pre':
                return highlight.as_latex(node)
            elif h == 'wbr':
                return '{}'
            else:
                #FIXME(alexander): set 1 as error-code?
                log.error('Unexpected tag: %s %r %r', h, a, b)
                return join("")
def parse_body(xml, context, normalize_transclusion):
    # pylint: disable=R0912,R0915,R0914
    """Generator translating ODF child elements of `xml` into AST triples.

    Iterates the children `e` of `xml` (element API with
    `.tag`/`.text`/`.tail`/`.attrib`; `itersiblings` suggests lxml --
    confirm), mapping each ODF tag to a `(head, attrs, body)` triple via
    `mkel`, recursing into nested content.  `context` provides style
    lookups (`context.stys`) and list-nesting state;
    `normalize_transclusion` rewrites image hrefs.  Tail text is yielded
    after each element.
    NOTE(review): reconstructed from a whitespace-mangled source --
    confirm statement nesting against the original file.
    """
    for e in xml:
        text = (e.text or '')
        tail = (e.tail or '')
        # some style properties should be promoted to tags, e.g. underlining
        # and bolding
        tags_from_style = []
        stys_dealt_with = []
        if e.tag in (S_TAG, TAB_TAG):
            # repeat ' ' or '\t' according to the text:c count attribute
            yield ' \t'[e.tag == TAB_TAG] * int(e.attrib.get(
                ns.text('c'), '1'))
            if tail:
                yield tail
            continue
        if e.tag == LINEBREAK_TAG:
            yield mkel('br', {}, [])
            continue

        sty = context.stys.get(
            e.get(STYLE_NAME_ATTR) or e.get(TABLE_STYLE_NAME_ATTR))
        # handle page breaks
        if sty and sty.par_break:
            assert e.tag in (H_TAG, P_TAG), \
                "Unexpected page-break in %r" % e.tag
            yield mkel('.pagebreak', {}, [])
            stys_dealt_with.append('par_break')
        # Handle lists specially
        if e.tag == LIST_TAG:
            new_context = context.bump_list_level(sty)
            stys_dealt_with.append('sub_list_styles')
        else:
            new_context = context

        body = list(parse_body(e, new_context, normalize_transclusion))
        # i.e. (type(body) is list and not body) or body[0] not a list
        assert type(body) is list and not body or type(body[0]) is not list
        attrs = {}
        if text:
            body = [text] + body
        if sty and sty.type.endswith('title'):
            head = sty.type
            body = [plaintextify(body)]
            sty = None
        elif e.tag == H_TAG:
            # skip empty headings; NB: this *must* happen
            # after we extracted eventual page-breaks, which are the only
            # useful information empty headings can contain
            if blank(body):
                continue
            head = sty.type
            # FIXME(alexander): keep track of the headings breadcrumbs in
            # context for two reasons
            #
            # 1. to associate errors with specific headings
            # 2. to warn about bad structure e.g. h1 followed by h4,
            #    rather than h2
        elif e.tag == LIST_TAG:
            head = new_context.list_type
            assert head in ('ol', 'ul')
            list_start = new_context.list_start
            if list_start is not None:
                assert head == 'ol'
                attrs['start'] = str(list_start)
            id_ = e.attrib.get(ns.xml('id')) # pylint: disable=E1101
            if id_ is not None:
                attrs['id'] = id_
            continues = e.attrib.get(ns.text('continue-list'))
            if continues is not None:
                # make this a data attrib, so we can stuff it
                # into the html, which doesn't have direct support
                attrs['data-continue-list'] = continues
        elif e.tag == LIST_ITEM_TAG:
            head = 'li'
        elif e.tag == ANNOTATION_TAG:
            head = 'aside'
        elif e.tag in (CREATOR_TAG, NOTE_CITATION_TAG, BOOKMARK_END_TAG):
            #FIXME: extract content
            if text:
                log.warning('Hey, someone actually specified a %s: %s',
                            e.tag, text)
            if tail:
                yield tail
            continue
        elif e.tag == NOTE_TAG:
            # other valid option is 'endnote'
            assert e.attrib[ns.text('note-class')] == 'footnote'
            # skip ahead and exit early; we only represent the note-body
            assert len(e) == 2 and e[1].tag == NOTE_BODY_TAG
            assert len(body) == 1
            yield body[0]
            if tail:
                yield tail
            continue
        elif e.tag == NOTE_BODY_TAG:
            head = '.footnote'
            # FIXME(alexander): sucky hack to strip the bogus whitespace
            # google docs enters at the beginning of a footnote for some
            # reason. I should really write a more generic whitespace
            # stripping mechanism in the postprocess module that can
            # recognize consecutive whitespace even if
            # seperated-by/wrapped-in inline tags.
            _, B1, B2, = map(Var, '_, B1, B2'.split(', '))
            SPACED_STR = Var('SPACED_STR',
                             lambda s: (isinstance(s, basestring)
                                        and re.match(r'\s+', s)))
            if body == Seq[('p', _, Seq[SPACED_STR, B2:]), B1:]:
                body[0][2][0] = SPACED_STR.val.lstrip()
        # FIXME(alexander): add anchors for all paras
        elif e.tag == P_TAG:
            # parses as: (sty.margin_left or sty.text_indent) if sty ...
            margin = sty.margin_left or sty.text_indent if sty else None
            indent_level = in_indents(margin) if margin else 0
            if indent_level:
                head = '.block'
                attrs['indent'] = indent_level
            else:
                head = 'p'
        #FIXME styled links etc. gdocs might not use that...
        #... but we should be able to handle non-span bolding etc.
        elif e.tag == SPAN_TAG:
            # XXX: order can matter; we need
            #    <b><u>command</u><b>
            # not
            #    <u><b>command</b><u>
            #
            # but more generally the minimal coalescing of abutting
            # partially overlapping styles is something that needs to be
            # thought about properly at some point.
            for attr, on_values, html_tags in [
                    ('underline', [True], ['u']),
                    ('font_weight', ['bold'], ['b']),
                    ('font_style', ['italic'], ['i']),
                    ('line_through', [True], ['s']),
                    ('text_position', ['sub', 'super'], ['sub', 'sup'])]:
                value = getattr(sty, attr, None)
                if value:
                    if value not in on_values:
                        log.error("Bad value for %s: %s in %s",
                                  attr, value, e.tag)
                        continue
                    tags_from_style.append(
                        html_tags[on_values.index(value)])
                    stys_dealt_with.append(attr)
            if is_code_font(sty.font_family):
                tags_from_style.append('code')
                stys_dealt_with.append('font_family')
            head = 'span'
        elif e.tag == A_TAG:
            assert e.attrib[ns.xlink('type')] == 'simple'
            head = 'a'
            attrs = dict(href=e.attrib[HREF_ATTR])
            # FIXME the in 'span' check is a bit too general, should use
            # something else to markup textcolor
            body = tidy(whack(lambda x: x in ('span', 'u'), body))
        elif e.tag == BOOKMARK_START_TAG:
            head = 'a'
            attrs = dict(name=e.attrib[TEXT_NAME_ATTR])
            assert (blank(text) and blank(tail) and
                    next(e.itersiblings()).tag == BOOKMARK_END_TAG)
        elif e.tag == TABLE_TAG:
            head = 'table'
            body = parse_table_body(body)
        elif e.tag == TABLE_ROW_TAG:
            head = 'tr'
        elif e.tag == TABLE_CELL_TAG:
            head = 'td'
        #FIXME repetition via table:number-columns-repeated
        #FIXME handle column-groups
        elif e.tag == TABLE_COLUMN_TAG:
            head = 'col'
            sty = context.stys.get(e.attrib.get(ns.table('style-name')))
            if sty and sty.width is not None:
                # XXX this isn't really the column width
                # since google moronically saves this even
                # if set column width is turned off thank you google!
                attrs = dict(style=OrderedDict(width=sty.width))
                stys_dealt_with.append('width')
        elif e.tag == FRAME_TAG:
            # XXX: try to find caption
            # FIXME(alexander): keep figures/tables with captions in
            # context, so that we can produce a lot/loi; add an id for
            # all of them
            inline = e.attrib[ns.text('anchor-type')] == 'as-char'
            width = (e.attrib.get(ns.svg('width')) # pylint: disable=E1101
                     or e.attrib[ns.style('rel-width')])
            # FIXME(alexander): should handle all these, in theory:
            # <http://www.w3.org/TR/SVG11/struct.html#SVGElementWidthAttribute>
            # ("em" | "ex" | "px" | "in" | "cm" | "mm" | "pt" | "pc" )
            assert width.endswith('cm'), \
                'Expected figure width in cm, got %s' % width
            relwidth = float(width[:-2]) / context.stys.textwidth
            head, attrs, body = make_figure(
                relwidth=relwidth,
                inline=inline,
                # FIXME(alexander): the body[0][1] to access the image
                # will blow up on leading whitespace in the body
                body=list(x for x in body
                          if not (isinstance(x, basestring) and blank(x))),
                src=body[0][1]['src'],
                original_href=e.find(
                    ns.draw('image')).get(ns.xlink('href')))
        elif e.tag == IMAGE_TAG:
            head = 'img'
            attrs = dict(src=normalize_transclusion(e.attrib[HREF_ATTR]))
        else:
            log.warning('Ignoring tag %s', e.tag)
            continue
            # FIXME raise RuntimeError('Unexpected tag: %s' % e.tag)

        # wrap body in tags promoted from character styles, innermost
        # first: ['b', 'u'] -> ('b', {}, [('u', ...)])
        sty_tagged = reduce(lambda parsed, tag: [mkel(tag, {}, parsed)],
                            tags_from_style, tidy(body))
        if sty:
            if sty.text_align:
                stys_dealt_with.append('text_align')
                attrs = add_class(attrs, sty.text_align)
            if sty.background_color:
                stys_dealt_with.append('background_color')
                iadd_style(attrs, 'background-color', sty.background_color)
            if sty.color:
                stys_dealt_with.append('color')
                iadd_style(attrs, 'color', sty.color)
        if e.tag == LIST_TAG:
            if new_context.list_style_type:
                attrs = add_class(attrs, new_context.list_style_type)
        # FIXME additional tidy
        parsed = mkel(head, attrs, sty_tagged)
        if head == 'span' and 'style' in attrs:
            # prefer ('code', {}, [('span', style, ...)]) nesting
            B = Var('B')
            if parsed == ('span', attrs, [('code', {}, B)]):
                parsed = mkel('code', {}, [('span', attrs, B.val)])
        leftover_styles = sty and set(
            sty.active_props()) - set(stys_dealt_with)
        if leftover_styles:
            log.warn('Ignoring style elements: %r in %r "%s"',
                     ([(k, getattr(sty, k)) for k in leftover_styles]),
                     head, plaintextify(body))
        preprocess.maybe_anchorize_id(head, attrs, sty_tagged)
        yield parsed
        if tail:
            yield tail