def postprocess(raw_body, transclusions, bibliography=None, asides=False): citations = set() # kill comments early, because they can mess things up # (e.g. by splitting commands or citations) # FIXME(alexander): comments should probably be out-of-band if not asides: raw_body = whack('aside'.__eq__, raw_body, kill_body=True) # FIXME(alexander): investigate the performance impact of this final tidy # -- it's needed for tidying up stuff that happened after commandification # (currently that only affects blockquotes) raw_parsed_body = tidy( space_normalize( list( unwrap_figures( captionize( underlines_to_commands( parse_cites(coalesce(raw_body), bib_entries=getattr( bibliography, 'entries', {}), collect_cite=citations.add))))))) unaugmented_head, body = extract_meta(raw_parsed_body, transclusions) if citations: if 'bibliography' not in unaugmented_head: docproblem(MISSING_BIBLIOGRAPHY, sorted(citations)[0]) return unaugmented_head, body
def bad_command(self, head, attrs, body):
    """Render an unknown ``CMD``/``LIT`` node as a visible conversion error.

    Registers a docproblem for the unknown command and returns LaTeX that
    shows the offending (underlined) command name wrapped in a red
    "CONVERSION ERROR" marker, anchored to the problem, followed by the
    latexified body.

    :param head: node tag; must be ``'LIT'`` or ``'CMD'``.
    :param attrs: node attributes; ``attrs['class'][0]`` is the command name.
    :param body: node children, latexified and appended after the warning.
    :returns: LaTeX string (whatever ``join`` produces).
    """
    assert head in ('LIT', 'CMD')
    bad_cmd = attrs['class'][0]
    n = docproblem('Unknown command: {}', bad_cmd)
    warning = small(red(self.latexify(
        u"CONVERSION ERROR: Not a valid command"
        u" (only use underlining for commands): “")))
    # CMD commands are displayed with a trailing ':', LIT ones without
    the_cmd = self.latexify(
        mkel('u', {}, [bad_cmd + (':' if head == 'CMD' else '')]))
    warning_end = small(red(self.latexify(u'”')))
    return join(problem_anchor(n, join(warning, the_cmd, warning_end)),
                self.latexify(body))
def postprocess(raw_body, transclusions, bibliography=None, asides=False):
    """Normalize a parsed document tree and split it into head and body.

    Applies the standard post-parse transformations (citation parsing,
    underline-to-command conversion, captioning, figure unwrapping,
    whitespace normalization and a final tidy), then extracts document
    metadata.  Returns the ``(head, body)`` pair from ``extract_meta``.
    Reports a ``MISSING_BIBLIOGRAPHY`` problem when citations were seen
    but no bibliography is declared in the head.
    """
    citations = set()
    # Comments ('aside' nodes) are removed up-front unless requested,
    # since they can split commands or citations mid-stream.
    # FIXME(alexander): comments should probably be out-of-band
    if not asides:
        raw_body = whack('aside'.__eq__, raw_body, kill_body=True)
    bib_entries = getattr(bibliography, 'entries', {})
    # Run the pipeline as explicit sequential stages, inner-most first.
    stage = parse_cites(coalesce(raw_body),
                        bib_entries=bib_entries,
                        collect_cite=citations.add)
    stage = underlines_to_commands(stage)
    stage = captionize(stage)
    stage = list(unwrap_figures(stage))
    # FIXME(alexander): investigate the performance impact of this final tidy
    # -- it's needed for tidying up stuff that happened after commandification
    # (currently that only affects blockquotes)
    raw_parsed_body = tidy(space_normalize(stage))
    unaugmented_head, body = extract_meta(raw_parsed_body, transclusions)
    if citations and 'bibliography' not in unaugmented_head:
        docproblem(MISSING_BIBLIOGRAPHY, sorted(citations)[0])
    return unaugmented_head, body
def bad_command(self, head, attrs, body):
    """Emit an inline, anchored error marker for an unrecognized command.

    The command name is shown underlined between red "CONVERSION ERROR"
    delimiters; the node's body is latexified and appended afterwards.
    """
    assert head in ('LIT', 'CMD')
    cmd_name = attrs['class'][0]
    problem_id = docproblem('Unknown command: {}', cmd_name)
    # CMD nodes are rendered with a trailing colon, LIT nodes without.
    suffix = ':' if head == 'CMD' else ''
    shown_cmd = self.latexify(mkel('u', {}, [cmd_name + suffix]))
    opening = small(red(self.latexify(
        u"CONVERSION ERROR: Not a valid command"
        u" (only use underlining for commands): “")))
    closing = small(red(self.latexify(u'”')))
    anchored = problem_anchor(problem_id, join(opening, shown_cmd, closing))
    return join(anchored, self.latexify(body))
def captionize(body):
    """Attach caption commands to the preceding figure/table element.

    Scans ``body`` pairwise; whenever an element is a ``caption`` command
    (bare or wrapped in a plain/justify/left ``p``), the caption is folded
    into the previous element as a ``figcaption``/``caption`` child --
    provided that previous element is a ``table`` or ``figure`` (possibly
    wrapped in a plain paragraph).  Otherwise a docproblem is reported and
    the caption is dropped.  Recurses into element bodies.

    NOTE(review): relies on pattern-matching ``Var`` objects whose ``.val``
    is set by a successful ``==``/``in`` comparison, and on ``window``
    presumably yielding a leading pair for the first element -- TODO confirm
    against ``window``'s implementation.
    """
    CBODY, TAG, PATTRS, FATTRS, FBODY = map(
        Var, 'CBODY, TAG, PATTRS, FATTRS, FBODY'.split(', '))
    ans = []
    for e1, e2 in window(body, 2):
        if e2 in (varcmd('caption', CBODY),
                  ('p', PATTRS, [varcmd('caption', CBODY)])):
            #XXX(alexander): the right way would probably be to normalize
            # justify/left away before we get here.
            e1_is_figure = ((e1 == ('p', PATTRS, [(TAG, FATTRS, FBODY)])
                             and PATTRS.val in ({},
                                                {'class': ['justify']},
                                                {'class': ['left']})
                             or e1 == (TAG, FATTRS, FBODY))
                            and TAG.val in ('table', 'figure'))
            if not e1_is_figure:
                # caption with nothing captionable before it: report and skip
                if TAG.match and TAG.val in H_TAGS:
                    docproblem(CAPTION_AFTER_HEADING,
                               plaintextify(CBODY.val),
                               plaintextify(FBODY.val))
                else:
                    docproblem(CAPTION_AFTER_NON_FLOAT,
                               plaintextify(CBODY.val))
                continue
            if PATTRS.match and PATTRS.val:
                log.warn(
                    'Unexpected attrs in paragraph wrapping the caption: %r',
                    PATTRS.val)
            # splice the caption in as the first child of the previous
            # (already appended) float element
            ans[-1] = (TAG.val, FATTRS.val,
                       [('figcaption' if TAG.val == 'figure' else 'caption',
                         {}, CBODY.val)] + captionize(FBODY.val))
        elif e2 == (TAG, FATTRS, FBODY):
            ans.append(mkel(TAG.val, FATTRS.val, captionize(FBODY.val)))
        else:
            # plain string (or other non-element) child
            ans.append(e2)
    return ans
def captionize(body):
    """Fold trailing ``caption`` commands into the preceding float.

    Walks ``body`` in sliding pairs.  A ``caption`` command (optionally
    wrapped in an attribute-free / justify / left paragraph) is merged into
    the directly preceding ``table`` or ``figure`` element as a
    ``caption``/``figcaption`` child; captions not preceded by a float are
    reported via ``docproblem`` and discarded.  Child bodies are processed
    recursively.

    NOTE(review): depends on ``Var`` match objects getting ``.val`` bound
    during comparison and on ``window``'s pairing behavior for the first
    element -- verify before refactoring.
    """
    CBODY, TAG, PATTRS, FATTRS, FBODY = map(
        Var, 'CBODY, TAG, PATTRS, FATTRS, FBODY'.split(', '))
    ans = []
    for e1, e2 in window(body, 2):
        if e2 in (varcmd('caption', CBODY),
                  ('p', PATTRS, [varcmd('caption', CBODY)])):
            #XXX(alexander): the right way would probably be to normalize
            # justify/left away before we get here.
            e1_is_figure = ((e1 == ('p', PATTRS, [(TAG, FATTRS, FBODY)])
                             and PATTRS.val in ({},
                                                {'class': ['justify']},
                                                {'class': ['left']})
                             or e1 == (TAG, FATTRS, FBODY))
                            and TAG.val in ('table', 'figure'))
            if not e1_is_figure:
                # nothing captionable precedes this caption
                if TAG.match and TAG.val in H_TAGS:
                    docproblem(CAPTION_AFTER_HEADING,
                               plaintextify(CBODY.val),
                               plaintextify(FBODY.val))
                else:
                    docproblem(CAPTION_AFTER_NON_FLOAT,
                               plaintextify(CBODY.val))
                continue
            if PATTRS.match and PATTRS.val:
                log.warn(
                    'Unexpected attrs in paragraph wrapping the caption: %r',
                    PATTRS.val)
            # rewrite the previously-appended float to carry the caption
            ans[-1] = (TAG.val, FATTRS.val,
                       [('figcaption' if TAG.val == 'figure' else 'caption',
                         {}, CBODY.val)] + captionize(FBODY.val))
        elif e2 == (TAG, FATTRS, FBODY):
            ans.append(mkel(TAG.val, FATTRS.val, captionize(FBODY.val)))
        else:
            ans.append(e2)
    return ans
def handle_fragment(fragment, indent, transclusions, h_shift, epub_clean,
                    bibliography):
    """Render a single document-tree fragment to an HTML string.

    :param fragment: either a plain string (escaped and returned) or a
        ``(tag, attrs, content)`` tuple.
    :param indent: current indentation string for pretty-printed output.
    :param transclusions: passed through to ``encode_attrs`` /
        ``handle_fragments`` -- presumably resolves embedded resources,
        TODO confirm.
    :param h_shift: integer offset applied to heading levels (clamped to
        the valid h1..hN range).
    :param epub_clean: when true, emit epub-valid markup (id instead of
        <a name>, no width/height img attributes, no col width attr).
    :param bibliography: optional bibliography object; citations are
        registered on its ``cited`` set.
    :returns: HTML string for this fragment.
    """
    # pylint: disable=R0911,R0914,R0912,R0913,R0915
    # FIXME(alexander): clean this up a bit, and get rid of pylint muffles
    if isinstance(fragment, basestring):
        return cgi.escape(fragment)
    (tag, attrs, content) = fragment
    if tag in ['script', 'style'] and content:
        # single text child expected; rendered CDATA-wrapped and indented
        content_str, = content
        return NOT_INLINE_TEMPLATE % dict(
            indent=indent,
            tag=tag,
            attrs_str=encode_attrs(attrs, transclusions, epub_clean),
            content_str=_indent(
                '\n' + maybe_cdatafy(_indent(content_str.strip('\n'), ' ')),
                indent))
    if tag == 'pre':
        return '\n' + highlight.as_html(fragment)
    # special case figures and tables
    if tag == 'figure':
        style = attrs['style'].copy()
        width = style.pop('width', '100%')
        attrs = dict(attrs.items(), style=style)
        # FIXME(alexander): dirty hacks to fixup caption & width
        img = content[-1]
        assert img[0] == 'img'
        # NB: mutates the img node's attrs in place
        img[1].setdefault('style', OrderedDict())['width'] = width
        # put figcaption towards end
        if content[0][0] == 'figcaption':
            content[0], content[-1] = content[-1], content[0]
        if style['display'] == 'inline':
            ATTRS = Var('ATTRS')  # pylint: disable=C0103
            assert content[:1] == [('img', ATTRS, [])], \
                "figure does not begin with an img"
            attrs = add_class(ATTRS.val, 'margin')
            # peel of the figure tag for inlined stuff
            # as a hack to make epub/html validate
            # (figures can't occur in all contexts imgs can)
            return handle_fragments([('img', attrs, [])],
                                    bibliography=bibliography,
                                    indent=indent,
                                    transclusions=transclusions,
                                    h_shift=h_shift,
                                    epub_clean=epub_clean)
    elif tag == 'table':
        colgroups = [el for el in content if el[0] == 'colgroup']
        COLS = Var("COLS")  # pylint: disable=C0103
        assert colgroups == [('colgroup', {}, COLS)], \
            "Expected single colgroup in table %s" % content
        # FIXME(alexander): this deepcopy is a lazy hack so we can mutate away
        # imperatively propagate table cell alignment down
        # this is a pretty horrible hack and would blow
        # up nastily if there is attribute aliasing,
        # but deepcopying should kinda make it work
        content = copy.deepcopy(content)
        _propagate_alignment(content, COLS.val)
    elif tag == 'col':
        if not epub_clean:
            # legacy HTML width attribute instead of inline style
            attrs = attrs.copy()
            attrs['width'] = attrs['style']['width']
            del attrs['style']
        # cull
        ## return handle_fragments(content, indent)
    # FIXME(alexander): might make more sense to filter (or h-ify) these out
    # elsewhere, but for now this seems not unreasonable
    elif tag == 'title':
        tag = 'h1'
        attrs = add_class(attrs, 'title')
    elif tag == 'subtitle':
        tag = 'h2'
        attrs = add_class(attrs, 'subtitle')
    elif tag in ('CMD', 'LIT'):
        bad_command = None
        cmd_type, = attrs['class']
        # FIXME(alexander): convert tex to html for non-math;
        # convert tex math to MML for epub
        if cmd_type in ('$', 'tex'):
            tex, = content
            if cmd_type == '$':
                tex = r'\(%s\)' % tex
            # rendered client-side by MathJax (tex2jax)
            return '<span class="tex2jax_process">%s</span>' % cgi.escape(tex)
        elif CITE_REX.match(cmd_type):
            if bibliography:
                bibliography.cited.add(content[0])
                # post = ('[%s]' % content[1] if len(content) > 1 and content[1]
                # else '')
                # Post is ignored for the moment
                return _format_citation(cmd_type, content[0], bibliography)
            else:
                docerror.docproblem(
                    'Citation exists, but bibliography is missing')
        else:
            # unknown command; rendered as an error marker further below
            bad_command = cmd_type + (':' if content else '')
            docerror.docproblem('Unknown command type:%s' % cmd_type)
    elif epub_clean:
        if tag == 'a' and 'name' in attrs:
            assert len(attrs) == 1
            attrs = {'id': attrs['name']}
        elif tag == 'img':
            attrs = {k: attrs[k]
                     for k in attrs if k not in ('width', 'height')}
    # FIXME(alexander): support continued-list properly in html, by keeping
    # track of numbers of items per list-id and translating it to start
    if tag in H_TAGS:
        if h_shift:
            tag = 'h%d' % min(len(H_TAGS), max(1, int(tag[1]) + h_shift))
    # generic [tagname].class tags
    if '.' in tag:
        if tag == '.pagebreak':
            tag = 'div.pagebreak'  # for whitespace sanitization
        tagname, classname = tag.split('.', 1)
        tag = tagname or 'span'
        attrs = add_class(attrs, classname)
    if tag == 'CMD' and bad_command:
        tag = 'span'
        attrs = {'class': ['bad-command']}
        content = [('u', {}, [bad_command])] + content
    elif tag == 'ERR':
        tag = 'span'
        attrs = {'class': ['err'], 'title': attrs['info'][0]}
    content_str = handle_fragments(content,
                                   indent=' ' + indent,
                                   transclusions=transclusions,
                                   h_shift=h_shift,
                                   epub_clean=epub_clean,
                                   bibliography=bibliography)
    if tag in VOID_TAGS:
        assert not content
        template = "<%(tag)s%(attrs_str)s/>"
    elif tag in INLINE:
        template = "<%(tag)s%(attrs_str)s>%(content_str)s</%(tag)s>"
    elif '\n' in content_str:
        template = NOT_INLINE_TEMPLATE
    else:
        template = COMPACT_NOT_INLINE_TEMPLATE
    # FIXME(alexander): disgusting hack; fix this properly and
    # use a set representation to start with!
    classes = attrs.get('class')
    if classes:
        attrs = attrs.copy()
        attrs['class'] = sorted(set(classes))
    return template % dict(indent=indent,
                           tag=tag,
                           attrs_str=encode_attrs(attrs, transclusions,
                                                  epub_clean),
                           content_str=content_str)
def latexify(self, ast):
    """Recursively convert a document-tree node (or list of nodes) to LaTeX.

    Strings are quoted; lists are converted element-wise and joined;
    ``(head, attrs, body)`` tuples are dispatched on their head tag.
    Uses ``self.inside``/``self.am_inside`` to track the enclosing
    environment (table, list, caption, footnote, blockquote, ...) and
    ``self.post_float_yuck`` to defer material (e.g. footnotetexts) until
    after the enclosing float is emitted.

    :returns: LaTeX source string.
    """
    # pylint: disable=E0102,R0914,R0915,R0911,R0912
    if isinstance(ast, list):
        # collapse a trailing blank line left over from block elements
        return re.sub('\n\n$', '\n', join(*map(self.latexify, ast)))
    else:
        node = ast
        if isinstance(node, basestring):
            return quote(node)
        else:
            assert isinstance(node, tuple)
            h, a, b = node
            if h == 'div':
                # canonicalize pseudo-elements: the single class becomes
                # the element's head tag
                h = a['class'].pop()
                assert not a['class']
                del a['class']
            if h[:-1] == 'h':
                # heading h1..hN
                if self.am_inside('list') or self.am_inside('table'):
                    return docwarn(
                        self.latexify(b),
                        'Cannot have sections inside lists or tables: %r'
                        % postprocess.plaintextify(b))
                else:
                    with self.inside('section'):
                        if a:
                            log.warn('heading w/ attr %r', a)
                        labels, b = extract_labels(b)
                        return self.section(h, b, labels)
            elif h == 'p':
                ans = nl(self.latexify(b))
                if self.am_inside('.footnote') and self.am_inside('table'):
                    return docwarn(ans,
                                   'Multi-paragraph footnotes in tables are'
                                   ' unsupported')
                return nl(ans)
            elif h == 'span':
                return self.latexify(b)  # XXX
            elif h in ('ol', 'ul'):
                # data-continue-list / start / id map onto enumerate_ options
                ol = partial(self.enumerate_,
                             start=a.get('start'),
                             series=a.get('id'),
                             resume=a.get('data-continue-list'))
                with self.inside('list'):
                    return nl(
                        freshline({'ol': ol,
                                   'ul': itemize}[h](self.latexify(b))))
            elif h == 'li':
                labels, b = extract_labels(b)
                labelling = (join(*(map(mklabel, labels) + [' ']))
                             if labels else '')
                return join(freshline(cmd('item')), labelling,
                            self.latexify(b))
            elif h == 'table':
                nested_table = self.am_inside('table')
                with self.inside('table'):
                    # pylint: disable=C0103
                    CLASS_TO_SPEC = {'left': 'P', 'center': 'C',
                                     'right': 'R', 'justify': 'N'}
                    b = b[:]
                    tablecaption = None
                    if b[0][0] == 'caption':
                        with self.inside('caption'):
                            tablecaption = self.latexify(b[0][2])
                        del b[0]
                    colgroup = [el for el in b if el[0] == 'colgroup']
                    rows = [el for el in b if el[0] == 'tr']
                    assert len(colgroup) == 1, \
                        "Expected single colgroup in table %s" % b
                    cols = colgroup[0][2]
                    colspecs = []
                    for col_h, col_a, col_b in cols:
                        if col_h != 'col':
                            break
                        assert not col_b
                        coltype = 'P'
                        for cls in CLASS_TO_SPEC:
                            if cls in col_a.get('class', []):
                                coltype = CLASS_TO_SPEC[cls]
                        # column spec carries the width as a \textwidth frac
                        coltype = "%s{%s}" % (coltype, textwidth_percent(
                            col_a['style']['width']))
                        colspecs.append(coltype)
                    rows = "\\tabularnewline\n".join(
                        map(self.latexify, rows))
                    if nested_table and tablecaption:
                        docproblem(
                            "Tables within tables can't have captions;"
                            " outputing caption as normal text",
                            level='warning')
                        ans = join(nl(table(colspecs, rows)), tablecaption)
                    else:
                        ans = table(colspecs, rows, tablecaption)
                # flush deferred footnotetexts once we're out of the table
                if self.post_float_yuck and not self.am_inside('table'):
                    ans = join(ans, *self.post_float_yuck)
                    del self.post_float_yuck[:]
                return ans
            elif h == 'col':
                # FIXME
                assert False, "Unexpected col"
            elif h == 'tr':
                return " & ".join(map(self.latexify, b))
            elif h == 'td':
                if 'headcol' in a.get('class', []):
                    return colh(self.latexify(b))
                return self.latexify(b)
            elif h == 'th':
                if 'headcol' in a.get('class', []):
                    return rowh(colh(self.latexify(b)))
                return rowh(self.latexify(b))
            elif h == 'figure':
                b = b[:]
                if b[0][0] == 'figcaption':
                    with self.inside('caption'):
                        figcaption = self.latexify(b[0][2])
                    del b[0]
                else:
                    figcaption = None
                assert len(b) == 1 and b[0][0] == 'img'
                img = b[0][1]['src']
                inline = False
                warns = []
                if a['style']['display'] == 'inline':
                    if self.am_inside('table'):
                        warns.append([
                            'Margin figures not supported in tables, '
                            'inserting into table cell'])
                    else:
                        inline = True
                if inline:
                    if figcaption:
                        warns.append(
                            ['Ignoring figcaption for inline figure:'
                             ' "%s"', figcaption])
                    ans = marginfigure(img=img)
                else:
                    fakecaption = figcaption and self.am_inside('table')
                    if fakecaption:
                        warns.append([
                            "Figures in tables can't have captions; "
                            "outputing caption as normal text"])
                    # inside blockquotes more complicated figure
                    # environments don't seem to work reliably
                    rawincludegraphics = self.am_inside('blockquote')
                    ans = figure(img=img,
                                 classes=a.get('class', []),
                                 width=a['style']['width'],
                                 figcaption=figcaption,
                                 fakecaption=fakecaption,
                                 rawincludegraphics=rawincludegraphics)
                if self.post_float_yuck and not self.am_inside('table'):
                    ans = join(ans, *self.post_float_yuck)
                    del self.post_float_yuck[:]
                return ans if not warns else docwarns(ans, *warns)
            elif h == 'img':
                assert False, 'unexpected image'
            elif h == 'a':
                if 'name' in a:
                    # we can't do that blindly, because we want to
                    # generate labels for things like lists and headings
                    # this is only a fallback for anchors outside of
                    # 'labelled' envs
                    return cmd('hypertarget', [],
                               [a['name'].lstrip('#'), ''])
                elif 'href' in a:
                    if a['href'].startswith('#'):
                        return cmd('hyperref',
                                   [latexify_href(a['href'][1:])],
                                   [self.latexify(b)])
                    ##
                    # XXX(alexander): handle bare urls specially, because
                    # we want more relaxed linebreaking rules for them.
                    # Note that we're not using \url directly, because
                    # it's not robust and also can't cope with certain
                    # arguments, such as unbalanced '{'/'}'s. Also, even
                    # with fairly aggressive hyphenization params, this is
                    # in in itself not enough to resolve all overfull hbox
                    # issues with urls, although it's not 100% clear to me
                    # why.
                    elif b and a['href'] in (b[0], url_fix(b[0])):
                        # XXX(alexander): use url_fixed version here?
                        return urldef(a['href'], self.urldefs)
                    else:
                        ans = cmd('href', [],
                                  [latexify_href(a['href']),
                                   self.latexify(b)])
                        if b[0].startswith('http'):
                            ans = docwarn(
                                ans,
                                'Suspicious link with body/href'
                                ' mismatch: %r != %r' % (
                                    a['href'].encode('utf-8'), b[0]))
                        return ans
                else:
                    assert False, 'Malformed link: %s' % ((h, a, b),)
            elif h == 'aside':
                return cmd('comment', [], [self.latexify(b)])
            elif h in ('b', 'i', 'u', 's'):
                assert not a, 'unexpected <%s %r' % (h, a)
                return self.handle_emphasis(h, b)
            elif h == 'code':
                #FIXME: write something more specialized
                return cmd('texttt', [], [self.latexify(b)])
            elif h == 'sup':
                return cmd('textsuperscript', [], [self.latexify(b)])
            elif h == 'sub':
                return cmd('textsubscript', [], [self.latexify(b)])
            elif h == '.footnote':
                with self.inside('.footnote'):
                    if self.am_inside('caption'):
                        # \footnote doesn't work in captions: defer the
                        # footnotetext until after the float
                        self.post_float_yuck.append(cmd('footnotetext', [],
                                                        [self.latexify(b)]))
                        return cmd(r'protect\footnotemark', [], [])
                    else:
                        return cmd('footnote', [], [self.latexify(b)])
            elif h == '.pagebreak':
                return nl(cmd('clearpage', [], [self.latexify(b)]))
            elif h == 'br':
                assert a == {}
                assert b == []
                return nl(cmd('newline'))
            elif h == 'blockquote':
                with self.inside('blockquote'):
                    return blockquote(self.latexify(b))
            elif (h == 'footer' and b == [Seq['cite', :]]
                  and self.am_inside('blockquote')):
                return nl(cmd('attrib', [], [self.latexify(b[0][2])]))
            elif node == ('CMD', {'class': ['$']}, b):
                return join('$', b[0], '$')
            elif node == ('CMD',
                          {'class': [Var('CITE', CITE_REX.match)]}, b):
                return self.munge_cite(node, b)
            elif node == ('CMD', {'class': ['tex']}, b):
                # raw tex passthrough
                return b[0]
            elif h in ('CMD', 'LIT'):
                return self.bad_command(*node)
            elif h == 'pre':
                return highlight.as_latex(node)
            elif h == 'wbr':
                return '{}'
            else:
                #FIXME(alexander): set 1 as error-code?
                log.error('Unexpected tag: %s %r %r', h, a, b)
                return join("")
def docwarns(latex_body, *warnings):
    """Report each warning as a doc problem and anchor them onto the body.

    Each ``warning`` is an argument list for ``docproblem`` (reported at
    warning level); the resulting problem ids are folded right-to-left
    around ``latex_body`` with ``problem_anchor``.
    """
    problem_ids = []
    for warning in warnings:
        problem_ids.append(docproblem(*warning, level='warning'))
    return reduce_right(problem_anchor, problem_ids, latex_body)
def latexify(self, ast):
    """Convert a document tree (string, list, or tagged tuple) to LaTeX.

    Dispatches on the node's head tag; environment context (table, list,
    caption, footnote, blockquote, section) is tracked via
    ``self.inside``/``self.am_inside``, and float-deferred material is
    accumulated in ``self.post_float_yuck`` and flushed after the
    enclosing float has been emitted.

    :returns: LaTeX source string.
    """
    # pylint: disable=E0102,R0914,R0915,R0911,R0912
    if isinstance(ast, list):
        # drop a trailing blank line introduced by block-level children
        return re.sub('\n\n$', '\n', join(*map(self.latexify, ast)))
    else:
        node = ast
        if isinstance(node, basestring):
            return quote(node)
        else:
            assert isinstance(node, tuple)
            h, a, b = node
            if h == 'div':
                # canonicalize pseudo-elements: single class -> head tag
                h = a['class'].pop()
                assert not a['class']
                del a['class']
            if h[:-1] == 'h':
                # heading h1..hN
                if self.am_inside('list') or self.am_inside('table'):
                    return docwarn(
                        self.latexify(b),
                        'Cannot have sections inside lists or tables: %r'
                        % postprocess.plaintextify(b))
                else:
                    with self.inside('section'):
                        if a:
                            log.warn('heading w/ attr %r', a)
                        labels, b = extract_labels(b)
                        return self.section(h, b, labels)
            elif h == 'p':
                ans = nl(self.latexify(b))
                if self.am_inside('.footnote') and self.am_inside('table'):
                    return docwarn(
                        ans, 'Multi-paragraph footnotes in tables are'
                        ' unsupported')
                return nl(ans)
            elif h == 'span':
                return self.latexify(b)  # XXX
            elif h in ('ol', 'ul'):
                # list continuation/start/series map onto enumerate_ options
                ol = partial(self.enumerate_,
                             start=a.get('start'),
                             series=a.get('id'),
                             resume=a.get('data-continue-list'))
                with self.inside('list'):
                    return nl(
                        freshline({'ol': ol,
                                   'ul': itemize}[h](self.latexify(b))))
            elif h == 'li':
                labels, b = extract_labels(b)
                labelling = (join(*(map(mklabel, labels) + [' ']))
                             if labels else '')
                return join(freshline(cmd('item')), labelling,
                            self.latexify(b))
            elif h == 'table':
                nested_table = self.am_inside('table')
                with self.inside('table'):
                    # pylint: disable=C0103
                    CLASS_TO_SPEC = {'left': 'P', 'center': 'C',
                                     'right': 'R', 'justify': 'N'}
                    b = b[:]
                    tablecaption = None
                    if b[0][0] == 'caption':
                        with self.inside('caption'):
                            tablecaption = self.latexify(b[0][2])
                        del b[0]
                    colgroup = [el for el in b if el[0] == 'colgroup']
                    rows = [el for el in b if el[0] == 'tr']
                    assert len(colgroup) == 1, \
                        "Expected single colgroup in table %s" % b
                    cols = colgroup[0][2]
                    colspecs = []
                    for col_h, col_a, col_b in cols:
                        if col_h != 'col':
                            break
                        assert not col_b
                        coltype = 'P'
                        for cls in CLASS_TO_SPEC:
                            if cls in col_a.get('class', []):
                                coltype = CLASS_TO_SPEC[cls]
                        # attach the column width (fraction of \textwidth)
                        coltype = "%s{%s}" % (coltype, textwidth_percent(
                            col_a['style']['width']))
                        colspecs.append(coltype)
                    rows = "\\tabularnewline\n".join(
                        map(self.latexify, rows))
                    if nested_table and tablecaption:
                        docproblem(
                            "Tables within tables can't have captions;"
                            " outputing caption as normal text",
                            level='warning')
                        ans = join(nl(table(colspecs, rows)), tablecaption)
                    else:
                        ans = table(colspecs, rows, tablecaption)
                # flush deferred footnotetexts once outside any table
                if self.post_float_yuck and not self.am_inside('table'):
                    ans = join(ans, *self.post_float_yuck)
                    del self.post_float_yuck[:]
                return ans
            elif h == 'col':
                # FIXME
                assert False, "Unexpected col"
            elif h == 'tr':
                return " & ".join(map(self.latexify, b))
            elif h == 'td':
                if 'headcol' in a.get('class', []):
                    return colh(self.latexify(b))
                return self.latexify(b)
            elif h == 'th':
                if 'headcol' in a.get('class', []):
                    return rowh(colh(self.latexify(b)))
                return rowh(self.latexify(b))
            elif h == 'figure':
                b = b[:]
                if b[0][0] == 'figcaption':
                    with self.inside('caption'):
                        figcaption = self.latexify(b[0][2])
                    del b[0]
                else:
                    figcaption = None
                assert len(b) == 1 and b[0][0] == 'img'
                img = b[0][1]['src']
                inline = False
                warns = []
                if a['style']['display'] == 'inline':
                    if self.am_inside('table'):
                        warns.append([
                            'Margin figures not supported in tables, '
                            'inserting into table cell'])
                    else:
                        inline = True
                if inline:
                    if figcaption:
                        warns.append(
                            ['Ignoring figcaption for inline figure:'
                             ' "%s"', figcaption])
                    ans = marginfigure(img=img)
                else:
                    fakecaption = figcaption and self.am_inside('table')
                    if fakecaption:
                        warns.append([
                            "Figures in tables can't have captions; "
                            "outputing caption as normal text"])
                    # inside blockquotes more complicated figure
                    # environments don't seem to work reliably
                    rawincludegraphics = self.am_inside('blockquote')
                    ans = figure(img=img,
                                 classes=a.get('class', []),
                                 width=a['style']['width'],
                                 figcaption=figcaption,
                                 fakecaption=fakecaption,
                                 rawincludegraphics=rawincludegraphics)
                if self.post_float_yuck and not self.am_inside('table'):
                    ans = join(ans, *self.post_float_yuck)
                    del self.post_float_yuck[:]
                return ans if not warns else docwarns(ans, *warns)
            elif h == 'img':
                assert False, 'unexpected image'
            elif h == 'a':
                if 'name' in a:
                    # we can't do that blindly, because we want to
                    # generate labels for things like lists and headings
                    # this is only a fallback for anchors outside of
                    # 'labelled' envs
                    return cmd('hypertarget', [],
                               [a['name'].lstrip('#'), ''])
                elif 'href' in a:
                    if a['href'].startswith('#'):
                        return cmd('hyperref',
                                   [latexify_href(a['href'][1:])],
                                   [self.latexify(b)])
                    ##
                    # XXX(alexander): handle bare urls specially, because
                    # we want more relaxed linebreaking rules for them.
                    # Note that we're not using \url directly, because
                    # it's not robust and also can't cope with certain
                    # arguments, such as unbalanced '{'/'}'s. Also, even
                    # with fairly aggressive hyphenization params, this is
                    # in in itself not enough to resolve all overfull hbox
                    # issues with urls, although it's not 100% clear to me
                    # why.
                    elif b and a['href'] in (b[0], url_fix(b[0])):
                        # XXX(alexander): use url_fixed version here?
                        return urldef(a['href'], self.urldefs)
                    else:
                        ans = cmd(
                            'href', [],
                            [latexify_href(a['href']),
                             self.latexify(b)])
                        if b[0].startswith('http'):
                            ans = docwarn(
                                ans,
                                'Suspicious link with body/href'
                                ' mismatch: %r != %r'
                                % (a['href'].encode('utf-8'), b[0]))
                        return ans
                else:
                    assert False, 'Malformed link: %s' % ((h, a, b), )
            elif h == 'aside':
                return cmd('comment', [], [self.latexify(b)])
            elif h in ('b', 'i', 'u', 's'):
                assert not a, 'unexpected <%s %r' % (h, a)
                return self.handle_emphasis(h, b)
            elif h == 'code':
                #FIXME: write something more specialized
                return cmd('texttt', [], [self.latexify(b)])
            elif h == 'sup':
                return cmd('textsuperscript', [], [self.latexify(b)])
            elif h == 'sub':
                return cmd('textsubscript', [], [self.latexify(b)])
            elif h == '.footnote':
                with self.inside('.footnote'):
                    if self.am_inside('caption'):
                        # \footnote is fragile in captions: defer the text
                        # until after the float
                        self.post_float_yuck.append(
                            cmd('footnotetext', [], [self.latexify(b)]))
                        return cmd(r'protect\footnotemark', [], [])
                    else:
                        return cmd('footnote', [], [self.latexify(b)])
            elif h == '.pagebreak':
                return nl(cmd('clearpage', [], [self.latexify(b)]))
            elif h == 'br':
                assert a == {}
                assert b == []
                return nl(cmd('newline'))
            elif h == 'blockquote':
                with self.inside('blockquote'):
                    return blockquote(self.latexify(b))
            elif (h == 'footer' and b == [Seq['cite', :]]
                  and self.am_inside('blockquote')):
                return nl(cmd('attrib', [], [self.latexify(b[0][2])]))
            elif node == ('CMD', {'class': ['$']}, b):
                return join('$', b[0], '$')
            elif node == ('CMD',
                          {'class': [Var('CITE', CITE_REX.match)]}, b):
                return self.munge_cite(node, b)
            elif node == ('CMD', {'class': ['tex']}, b):
                # raw tex passthrough
                return b[0]
            elif h in ('CMD', 'LIT'):
                return self.bad_command(*node)
            elif h == 'pre':
                return highlight.as_latex(node)
            elif h == 'wbr':
                return '{}'
            else:
                #FIXME(alexander): set 1 as error-code?
                log.error('Unexpected tag: %s %r %r', h, a, b)
                return join("")
def handle_fragment(fragment, indent, transclusions, h_shift, epub_clean,
                    bibliography):
    """Serialize one document-tree fragment to HTML.

    :param fragment: plain string (returned escaped) or a
        ``(tag, attrs, content)`` tuple.
    :param indent: indentation prefix used by the non-inline templates.
    :param transclusions: forwarded to ``encode_attrs``/``handle_fragments``;
        presumably a resource registry -- TODO confirm.
    :param h_shift: heading-level offset, clamped to the h1..hN range.
    :param epub_clean: when true, produce epub-valid markup variants.
    :param bibliography: optional bibliography; cited keys are recorded on
        its ``cited`` set.
    :returns: HTML string.
    """
    # pylint: disable=R0911,R0914,R0912,R0913,R0915
    # FIXME(alexander): clean this up a bit, and get rid of pylint muffles
    if isinstance(fragment, basestring):
        return cgi.escape(fragment)
    (tag, attrs, content) = fragment
    if tag in ['script', 'style'] and content:
        # expect a single text child; emit CDATA-wrapped, indented
        content_str, = content
        return NOT_INLINE_TEMPLATE % dict(
            indent=indent,
            tag=tag,
            attrs_str=encode_attrs(attrs, transclusions, epub_clean),
            content_str=_indent(
                '\n' + maybe_cdatafy(_indent(content_str.strip('\n'), ' ')),
                indent))
    if tag == 'pre':
        return '\n' + highlight.as_html(fragment)
    # special case figures and tables
    if tag == 'figure':
        style = attrs['style'].copy()
        width = style.pop('width', '100%')
        attrs = dict(attrs.items(), style=style)
        # FIXME(alexander): dirty hacks to fixup caption & width
        img = content[-1]
        assert img[0] == 'img'
        # NB: mutates the img node's attr dict in place
        img[1].setdefault('style', OrderedDict())['width'] = width
        # put figcaption towards end
        if content[0][0] == 'figcaption':
            content[0], content[-1] = content[-1], content[0]
        if style['display'] == 'inline':
            ATTRS = Var('ATTRS')  # pylint: disable=C0103
            assert content[:1] == [('img', ATTRS, [])], \
                "figure does not begin with an img"
            attrs = add_class(ATTRS.val, 'margin')
            # peel of the figure tag for inlined stuff
            # as a hack to make epub/html validate
            # (figures can't occur in all contexts imgs can)
            return handle_fragments([('img', attrs, [])],
                                    bibliography=bibliography,
                                    indent=indent,
                                    transclusions=transclusions,
                                    h_shift=h_shift,
                                    epub_clean=epub_clean)
    elif tag == 'table':
        colgroups = [el for el in content if el[0] == 'colgroup']
        COLS = Var("COLS")  # pylint: disable=C0103
        assert colgroups == [('colgroup', {}, COLS)], \
            "Expected single colgroup in table %s" % content
        # FIXME(alexander): this deepcopy is a lazy hack so we can mutate away
        # imperatively propagate table cell alignment down
        # this is a pretty horrible hack and would blow
        # up nastily if there is attribute aliasing,
        # but deepcopying should kinda make it work
        content = copy.deepcopy(content)
        _propagate_alignment(content, COLS.val)
    elif tag == 'col':
        if not epub_clean:
            # legacy width attribute instead of an inline style
            attrs = attrs.copy()
            attrs['width'] = attrs['style']['width']
            del attrs['style']
        # cull
        ## return handle_fragments(content, indent)
    # FIXME(alexander): might make more sense to filter (or h-ify) these out
    # elsewhere, but for now this seems not unreasonable
    elif tag == 'title':
        tag = 'h1'
        attrs = add_class(attrs, 'title')
    elif tag == 'subtitle':
        tag = 'h2'
        attrs = add_class(attrs, 'subtitle')
    elif tag in ('CMD', 'LIT'):
        bad_command = None
        cmd_type, = attrs['class']
        # FIXME(alexander): convert tex to html for non-math;
        # convert tex math to MML for epub
        if cmd_type in ('$', 'tex'):
            tex, = content
            if cmd_type == '$':
                tex = r'\(%s\)' % tex
            # left for client-side MathJax (tex2jax) to render
            return '<span class="tex2jax_process">%s</span>' % cgi.escape(tex)
        elif CITE_REX.match(cmd_type):
            if bibliography:
                bibliography.cited.add(content[0])
                # post = ('[%s]' % content[1] if len(content) > 1 and content[1]
                # else '')
                # Post is ignored for the moment
                return _format_citation(cmd_type, content[0], bibliography)
            else:
                docerror.docproblem(
                    'Citation exists, but bibliography is missing')
        else:
            # unknown command: flagged here, rendered as error span below
            bad_command = cmd_type + (':' if content else '')
            docerror.docproblem('Unknown command type:%s' % cmd_type)
    elif epub_clean:
        if tag == 'a' and 'name' in attrs:
            assert len(attrs) == 1
            attrs = {'id': attrs['name']}
        elif tag == 'img':
            attrs = {k: attrs[k]
                     for k in attrs if k not in ('width', 'height')}
    # FIXME(alexander): support continued-list properly in html, by keeping
    # track of numbers of items per list-id and translating it to start
    if tag in H_TAGS:
        if h_shift:
            tag = 'h%d' % min(len(H_TAGS), max(1, int(tag[1]) + h_shift))
    # generic [tagname].class tags
    if '.' in tag:
        if tag == '.pagebreak':
            tag = 'div.pagebreak'  # for whitespace sanitization
        tagname, classname = tag.split('.', 1)
        tag = tagname or 'span'
        attrs = add_class(attrs, classname)
    if tag == 'CMD' and bad_command:
        tag = 'span'
        attrs = {'class': ['bad-command']}
        content = [('u', {}, [bad_command])] + content
    elif tag == 'ERR':
        tag = 'span'
        attrs = {'class': ['err'], 'title': attrs['info'][0]}
    content_str = handle_fragments(content,
                                   indent=' ' + indent,
                                   transclusions=transclusions,
                                   h_shift=h_shift,
                                   epub_clean=epub_clean,
                                   bibliography=bibliography)
    if tag in VOID_TAGS:
        assert not content
        template = "<%(tag)s%(attrs_str)s/>"
    elif tag in INLINE:
        template = "<%(tag)s%(attrs_str)s>%(content_str)s</%(tag)s>"
    elif '\n' in content_str:
        template = NOT_INLINE_TEMPLATE
    else:
        template = COMPACT_NOT_INLINE_TEMPLATE
    # FIXME(alexander): disgusting hack; fix this properly and
    # use a set representation to start with!
    classes = attrs.get('class')
    if classes:
        attrs = attrs.copy()
        attrs['class'] = sorted(set(classes))
    return template % dict(
        indent=indent,
        tag=tag,
        attrs_str=encode_attrs(attrs, transclusions, epub_clean),
        content_str=content_str)