def IndexGroupToText(group_text):
    """Convert the HTML of one index group to text.

    Walks the tokens of `group_text`.  Raw data is printed as-is and named
    character entities are printed as their HTML_REFS replacement.  Other
    token kinds produce no output inside the loop.

    Note: We could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    Raises:
      AssertionError: on hex / decimal character tokens (not handled yet).
      KeyError: if a named entity has no entry in HTML_REFS.
    """
    buf = cStringIO.StringIO()
    out = html.Output(group_text, buf)

    start = 0  # offset where the current token begins
    for tok_id, end in html.ValidTokens(group_text):
        if tok_id == html.RawData:
            out.SkipTo(start)
            out.PrintUntil(end)

        elif tok_id == html.CharEntity:
            # Named entity, e.g. &amp;  Drop the first and last characters
            # (presumably the '&' and the ';') to get the lookup key.
            name = group_text[start + 1:end - 1]
            out.SkipTo(start)
            out.Print(HTML_REFS[name])
            out.SkipTo(end)

        else:
            # Not handling these yet
            unsupported = None
            if tok_id == html.HexChar:
                unsupported = 'Hex Char %r'
            elif tok_id == html.DecChar:
                unsupported = 'Dec Char %r'

            if unsupported is not None:
                raise AssertionError(
                    unsupported % group_text[start:start + 20])

        start = end

    out.PrintTheRest()
    return buf.getvalue()
def ExtractBody(s):
    """Extract what's in between <body></body>

    The splitter needs balanced tags, and what's in <head> isn't balanced.

    Scanning stops at the first <body> start tag.  The output calls are only
    made once that tag has been seen.
    """
    buf = cStringIO.StringIO()
    out = html.Output(s, buf)
    tag_lexer = html.TagLexer(s)

    tokens = html.ValidTokens(s)
    tok_start = 0  # offset where the current token begins
    for tok_id, tok_end in tokens:
        if tok_id == html.StartTag:
            tag_lexer.Reset(tok_start, tok_end)
            if tag_lexer.TagName() == 'body':
                # Begin right after <body> ...
                out.SkipTo(tok_end)
                # ... and print up to the left edge of </body>.  The helper
                # pulls tokens from the same iterator this loop uses.
                body_end_left, _ = html.ReadUntilEndTag(
                    tokens, tag_lexer, 'body')
                out.PrintUntil(body_end_left)
                break

        tok_start = tok_end

    return buf.getvalue()
def ExpandLinks(s):
    """Expand $xref:bash and so forth.

    Looks at every <a> start tag in `s`.  If the href value matches
    _SHORTCUT_RE, the matching function in _ABBREVIATIONS is called with the
    shortcut's argument (or with the anchor's inner HTML when the shortcut has
    no argument), and the escaped result replaces the href value.

    Args:
      s: HTML text.

    Returns:
      The text written through the html.Output helper, as a string.

    Raises:
      RuntimeError: if a shortcut names an abbreviation that has no entry in
        _ABBREVIATIONS.
    """
    f = cStringIO.StringIO()
    out = html.Output(s, f)

    tag_lexer = html.TagLexer(s)

    # Start offset of the current token.  It must be advanced on EVERY
    # iteration, otherwise the next tag is lexed from the wrong offset.
    pos = 0

    it = html.ValidTokens(s)
    for tok_id, end_pos in it:
        if tok_id == html.StartTag:
            tag_lexer.Reset(pos, end_pos)
            if tag_lexer.TagName() == 'a':
                open_tag_right = end_pos

                href_start, href_end = tag_lexer.GetSpanForAttrValue('href')
                # -1 means the tag has no href.  Fall through to the bottom
                # of the loop instead of using 'continue', so that 'pos' is
                # still updated for the next token.
                if href_start != -1:
                    # TODO: Need to unescape like GetAttr()
                    href = s[href_start:href_end]

                    new = None
                    m = _SHORTCUT_RE.match(href)
                    if m:
                        abbrev_name, arg = m.groups()
                        if not arg:
                            # No explicit argument: use the anchor's inner
                            # HTML.  This consumes tokens through </a>.
                            close_tag_left, close_tag_right = (
                                html.ReadUntilEndTag(it, tag_lexer, 'a'))
                            arg = s[open_tag_right:close_tag_left]
                            # The next token from 'it' starts after </a>,
                            # not after the opening tag.
                            end_pos = close_tag_right

                        func = _ABBREVIATIONS.get(abbrev_name)
                        if not func:
                            raise RuntimeError(
                                'Invalid abbreviation %r' % abbrev_name)
                        new = func(arg)

                    if new is not None:
                        out.PrintUntil(href_start)
                        f.write(cgi.escape(new))
                        out.SkipTo(href_end)

        pos = end_pos

    out.PrintTheRest()

    return f.getvalue()
def HelpTopics(s):
    """Given an HTML page, yield groups (id, desc, block of text).

    For each <h2> start tag, this reads the heading up to </h2>, then the next
    <code>...</code> span.

    Args:
      s: HTML text.

    Yields:
      (group_topic_id, group_name, text) tuples, where
      - group_topic_id is the id= attribute of the <h2>,
      - group_name is the heading HTML before its first '(', stripped,
      - text is html.ToText() of what's between <code> and </code>.

    Raises:
      AssertionError: if the <h2> has no id=, if the heading has no '(', or
        if the <code> tag's class is not one of the help-topics classes.
    """
    tag_lexer = html.TagLexer(s)

    pos = 0  # start offset of the current token
    it = html.ValidTokens(s)
    for tok_id, end_pos in it:
        if tok_id == html.StartTag:
            tag_lexer.Reset(pos, end_pos)

            # Capture <h2 id="foo"> first
            if tag_lexer.TagName() == 'h2':
                h2_start_right = end_pos

                group_topic_id = tag_lexer.GetAttr('id')
                assert group_topic_id, (
                    'Expected id= in %r' % tag_lexer.TagString())

                h2_end_left, _ = html.ReadUntilEndTag(it, tag_lexer, 'h2')

                anchor_html = s[h2_start_right:h2_end_left]
                paren_pos = anchor_html.find('(')
                assert paren_pos != -1, anchor_html

                group_name = anchor_html[:paren_pos].strip()

                # Now find the <code></code> span
                _, code_start_right = html.ReadUntilStartTag(
                    it, tag_lexer, 'code')
                css_class = tag_lexer.GetAttr('class')
                assert css_class in (
                    'language-oil-help-topics',
                    'language-osh-help-topics'), tag_lexer.TagString()

                code_end_left, code_end_right = html.ReadUntilEndTag(
                    it, tag_lexer, 'code')

                text = html.ToText(s, code_start_right, code_end_left)
                yield group_topic_id, group_name, text

                # The helpers above pulled tokens from 'it' through </code>,
                # so the next token starts there, not after <h2>.
                end_pos = code_end_right

        pos = end_pos
def RemoveComments(s):
    """Remove <!-- comments -->

    Comment tokens whose text contains 'REPLACE' are kept, because
    doc/release-index.md has <!-- REPLACE_WITH_DATE --> etc.

    Args:
      s: HTML text.

    Returns:
      The text written through the html.Output helper, as a string.
    """
    f = cStringIO.StringIO()
    out = html.Output(s, f)

    pos = 0  # start offset of the current token
    for tok_id, end_pos in html.ValidTokens(s):
        if tok_id == html.Comment and 'REPLACE' not in s[pos:end_pos]:
            # Print what precedes the comment, then jump past it.
            out.PrintUntil(pos)
            out.SkipTo(end_pos)
        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
def HighlightCode(s, default_highlighter):
    """Highlight the code inside <pre><code> ... </code></pre> blocks.

    Algorithm:
    1. Collect what's inside <pre><code> ...
    2. Then read lines with ShPromptPlugin.
    3. If the line looks like a shell prompt and command, highlight them with
       <span>

    Which plugin runs depends on the class= attribute of the <code> tag:

    - no class: ShPromptPlugin if default_highlighter is 'oil-sh'; left
      alone if default_highlighter is None
    - language-sh-prompt: ShPromptPlugin
    - language-oil: left alone (no highlighter yet)
    - language-osh-help-topics / language-oil-help-topics: HelpTopicsPlugin
    - any other class starting with 'language': PygmentsPlugin, which
      replaces the whole <pre> ... </pre> span
    - a class that doesn't start with 'language': left alone

    Args:
      s: HTML text.
      default_highlighter: None, or 'oil-sh'.

    Returns:
      The text written through the html.Output helper, as a string.

    Raises:
      RuntimeError: if default_highlighter is neither None nor 'oil-sh' and a
        <code> tag without a class is seen.
      AssertionError: in the Pygments case, if the token after </code> is not
        the </pre> end tag.
    """
    f = cStringIO.StringIO()
    out = html.Output(s, f)

    tag_lexer = html.TagLexer(s)

    # class= value -> second argument of HelpTopicsPlugin
    # TODO: Link to osh-help.html, instead of oil-help.html
    help_topic_kinds = {
        'language-osh-help-topics': 'osh',
        'language-oil-help-topics': 'oil',
    }

    pos = 0  # start offset of the current token

    it = html.ValidTokens(s)
    for tok_id, end_pos in it:
        if tok_id == html.StartTag:
            tag_lexer.Reset(pos, end_pos)
            if tag_lexer.TagName() == 'pre':
                pre_start_pos = pos
                pos = end_pos

                # Look at the token right after <pre>
                try:
                    tok_id, end_pos = next(it)
                except StopIteration:
                    break

                tag_lexer.Reset(pos, end_pos)
                if tok_id == html.StartTag and tag_lexer.TagName() == 'code':
                    css_class = tag_lexer.GetAttr('class')
                    code_start_pos = end_pos

                    if css_class is None:
                        slash_code_left, slash_code_right = (
                            html.ReadUntilEndTag(it, tag_lexer, 'code'))
                        # Tokens through </code> were consumed, so the next
                        # token from 'it' starts at the right edge of </code>.
                        end_pos = slash_code_right

                        if default_highlighter is not None:
                            if default_highlighter == 'oil-sh':
                                out.PrintUntil(code_start_pos)

                                # Using ShPromptPlugin because it does the
                                # comment highlighting we want!
                                plugin = ShPromptPlugin(
                                    s, code_start_pos, slash_code_left)
                                plugin.PrintHighlighted(out)

                                out.SkipTo(slash_code_left)
                            else:
                                raise RuntimeError(
                                    'Unknown default highlighter %r' %
                                    default_highlighter)

                    elif css_class.startswith('language'):
                        slash_code_left, slash_code_right = (
                            html.ReadUntilEndTag(it, tag_lexer, 'code'))
                        # Same as above: keep the token offset in sync.
                        end_pos = slash_code_right

                        if css_class == 'language-sh-prompt':
                            # Here we're KEEPING the original <pre><code>
                            # Print everything up to and including
                            # <pre><code language="...">
                            out.PrintUntil(code_start_pos)

                            plugin = ShPromptPlugin(
                                s, code_start_pos, slash_code_left)
                            plugin.PrintHighlighted(out)

                            out.SkipTo(slash_code_left)

                        elif css_class == 'language-oil':
                            # TODO: Write an Oil syntax highlighter.
                            pass

                        elif css_class in help_topic_kinds:
                            out.PrintUntil(code_start_pos)

                            plugin = HelpTopicsPlugin(
                                s, code_start_pos, slash_code_left,
                                help_topic_kinds[css_class])
                            plugin.PrintHighlighted(out)

                            out.SkipTo(slash_code_left)

                        else:  # language-*: Use Pygments
                            # We REMOVE the original <pre><code> because
                            # Pygments gives you a <pre> already

                            # We just read closing </code>, and the next one
                            # should be </pre>.
                            try:
                                tok_id, end_pos = next(it)
                            except StopIteration:
                                break
                            tag_lexer.Reset(slash_code_right, end_pos)
                            assert tok_id == html.EndTag, tok_id
                            assert tag_lexer.TagName() == 'pre', \
                                tag_lexer.TagName()
                            slash_pre_right = end_pos

                            out.PrintUntil(pre_start_pos)

                            lang = css_class[len('language-'):]
                            plugin = PygmentsPlugin(
                                s, code_start_pos, slash_code_left, lang)
                            plugin.PrintHighlighted(out)

                            out.SkipTo(slash_pre_right)
                            f.write('<!-- done pygments -->\n')

        pos = end_pos

    out.PrintTheRest()

    return f.getvalue()