def IndexGroupToText(group_text):
    """Convert the HTML of one index group to text.

    Raw data tokens are copied to the output, and named character entities
    are replaced by their entry in HTML_REFS.  Tokens of any other kind are
    not printed by the loop; the cursor simply advances past them.

    Note: We could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    Args:
      group_text: HTML fragment to convert.

    Returns:
      Everything written to the output buffer, as a string.

    Raises:
      AssertionError: on a hex or decimal character reference, which are
        not handled yet.
    """
    buf = cStringIO.StringIO()
    out = html.Output(group_text, buf)

    start = 0  # offset where the current token begins
    for tok_id, end in html.ValidTokens(group_text):
        # Not handling these yet
        if tok_id == html.HexChar:
            raise AssertionError(
                'Hex Char %r' % group_text[start : start + 20])
        if tok_id == html.DecChar:
            raise AssertionError(
                'Dec Char %r' % group_text[start : start + 20])

        if tok_id == html.RawData:
            out.SkipTo(start)
            out.PrintUntil(end)
        elif tok_id == html.CharEntity:
            # Drop the first and last character of the token (presumably
            # the '&' and ';' delimiters) to get the entity name.
            name = group_text[start + 1 : end - 1]
            out.SkipTo(start)
            out.Print(HTML_REFS[name])
            out.SkipTo(end)

        start = end

    out.PrintTheRest()
    return buf.getvalue()
def ExtractBody(s):
    """Return what lies between the first <body> start tag and the position
    html.ReadUntilEndTag() reports for its 'body' end tag.

    The splitter needs balanced tags, and what's in <head> isn't balanced.

    Args:
      s: HTML string.

    Returns:
      The extracted span; nothing is printed if no <body> start tag is seen.
    """
    buf = cStringIO.StringIO()
    out = html.Output(s, buf)
    lexer = html.TagLexer(s)

    # The same iterator is handed to ReadUntilEndTag(), which advances it.
    tokens = html.ValidTokens(s)

    tok_start = 0
    for tok_id, tok_end in tokens:
        if tok_id == html.StartTag:
            lexer.Reset(tok_start, tok_end)
            if lexer.TagName() == 'body':
                # tok_end is right after <body>
                out.SkipTo(tok_end)
                body_close_left, _ = html.ReadUntilEndTag(
                    tokens, lexer, 'body')
                out.PrintUntil(body_close_left)
                break

        tok_start = tok_end

    return buf.getvalue()
def ExpandLinks(s):
    """Expand $xref:bash and so forth.

    For every <a> start tag whose href value matches _SHORTCUT_RE, the href
    value is replaced by the escaped result of the corresponding
    _ABBREVIATIONS function.  When the shortcut carries no argument, the
    text between <a ...> and </a> is used as the argument instead.

    Args:
      s: HTML string.

    Returns:
      The HTML with matching href values rewritten; everything else is
      copied through unchanged.

    Raises:
      RuntimeError: if a shortcut names an abbreviation that has no usable
        entry in _ABBREVIATIONS.
    """
    f = cStringIO.StringIO()
    out = html.Output(s, f)

    tag_lexer = html.TagLexer(s)

    # Invariant: at the top of each iteration, pos is the start offset of the
    # token about to be read.  tag_lexer.Reset(pos, end_pos) relies on it.
    pos = 0

    it = html.ValidTokens(s)

    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break

        if tok_id == html.StartTag:
            tag_lexer.Reset(pos, end_pos)
            if tag_lexer.TagName() == 'a':
                open_tag_right = end_pos

                href_start, href_end = tag_lexer.GetSpanForAttrValue('href')
                # An <a> without href (span start of -1) is left alone.  This
                # is a plain condition rather than `continue`, so the
                # `pos = end_pos` update at the bottom always runs.
                if href_start != -1:
                    # TODO: Need to unescape like GetAttr()
                    href = s[href_start:href_end]

                    new = None
                    m = _SHORTCUT_RE.match(href)
                    if m:
                        abbrev_name, arg = m.groups()
                        if not arg:
                            # No explicit argument: use the anchor text.
                            close_tag_left, close_tag_right = \
                                html.ReadUntilEndTag(it, tag_lexer, 'a')
                            arg = s[open_tag_right:close_tag_left]
                            # The iterator is now past </a>, so the next
                            # token starts there, not after the open tag.
                            end_pos = close_tag_right

                        func = _ABBREVIATIONS.get(abbrev_name)
                        if not func:
                            raise RuntimeError(
                                'Invalid abbreviation %r' % abbrev_name)
                        new = func(arg)

                    if new is not None:
                        out.PrintUntil(href_start)
                        # NOTE(review): cgi.escape() without quote=True does
                        # not escape '"', and this text is written in place
                        # of the href attribute value -- confirm the
                        # abbreviation functions never return a double quote.
                        f.write(cgi.escape(new))
                        out.SkipTo(href_end)

        pos = end_pos

    out.PrintTheRest()

    return f.getvalue()
def testShPrompt(self):
    """Smoke test: run the prompt regex and ShPromptPlugin on a sample line.

    Prints the regex groups and the highlighted output to stdout; nothing is
    asserted.
    """
    sample = 'oil$ ls -l<TAB> # comment'

    match = oil_doc._PROMPT_LINE_RE.match(sample)
    print(match.groups())
    print(match.group(2))
    print(match.end(2))

    # Highlight the whole sample, writing straight to stdout.
    highlighter = oil_doc.ShPromptPlugin(sample, 0, len(sample))
    highlighter.PrintHighlighted(html.Output(sample, sys.stdout))
def RemoveComments(s):
    """Remove <!-- comments -->, except those containing 'REPLACE'.

    Args:
      s: HTML string.

    Returns:
      The HTML with comment tokens dropped.  A comment whose text contains
      the substring 'REPLACE' is kept as is.
    """
    f = cStringIO.StringIO()
    out = html.Output(s, f)

    pos = 0  # start offset of the current token
    for tok_id, end_pos in html.ValidTokens(s):
        if tok_id == html.Comment:
            value = s[pos:end_pos]
            # doc/release-index.md has <!-- REPLACE_WITH_DATE --> etc.
            if 'REPLACE' not in value:
                # Emit everything before the comment, then jump over it.
                out.PrintUntil(pos)
                out.SkipTo(end_pos)
        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
def HighlightLine(line):
    """Turn one line of text into HTML with links and red X markers.

    A [Section] at the start becomes a "level2" link, each topic becomes a
    link into help.html, and X markers are wrapped in X_LEFT_SPAN ... </span>.

    Args:
      line: RAW SPAN of HTML that is already escaped.

    Returns:
      The HTML with some tags inserted.  A line that starts with neither of
      the two recognized prefixes is returned unchanged.
    """
    buf = cStringIO.StringIO()
    out = html.Output(line, buf)

    def _Wrap(left, right, open_tag, close_tag):
        # Copy up to `left`, then emit line[left:right] inside the tag pair.
        out.PrintUntil(left)
        out.Print(open_tag)
        out.PrintUntil(right)
        out.Print(close_tag)

    if line.startswith('X '):
        out.Print(X_LEFT_SPAN)
        out.PrintUntil(2)
        out.Print('</span>')
    elif not line.startswith(' '):
        return line
    # NOTE(review): the second prefix test checks a single character while
    # the cursor skips two -- confirm the intended prefix width.
    cursor = 2  # position within line, just past the prefix

    # Highlight [Section] at the start of a line.
    section = SECTION_RE.match(line, cursor)
    if section:
        href = _StringToHref(section.group(1))
        _Wrap(section.start(1), section.end(1),
              '<a href="help.html#%s" class="level2">' % href, '</a>')
        cursor = section.end(0)  # ADVANCE

    gap = re.compile(r'[ ]+').match(line, cursor)
    assert gap, 'Expected whitespace %r' % line
    cursor = gap.end(0)

    while True:
        # Match one topic at a time.
        hit = TOPIC_RE.match(line, cursor)
        if not hit or hit.group(2) in _NOT_A_TOPIC:
            break

        if hit.group(1):
            _Wrap(hit.start(1), hit.end(1), X_LEFT_SPAN, '</span>')

        # The linked topic
        topic = hit.group(2)
        _Wrap(hit.start(2), hit.end(2),
              '<a href="help.html#%s">' % topic, '</a>')

        cursor = hit.end(0)

        # Trailing 3 spaces required to continue.
        if not hit.group(4):
            break

    out.PrintTheRest()
    return buf.getvalue()
def HighlightCode(s, default_highlighter):
    """Highlight the contents of <pre><code> blocks in an HTML string.

    Algorithm:
    1. Collect what's inside <pre><code> ...
    2. Then read lines with ShPromptPlugin.
    3. If the line looks like a shell prompt and command, highlight them with
       <span>

    The plugin is chosen from the 'class' attribute of the <code> tag:

    - GetAttr('class') is None: default_highlighter decides.  'oil-sh' uses
      ShPromptPlugin; None leaves the block untouched.
    - language-sh-prompt: ShPromptPlugin
    - language-oil: left untouched for now
    - language-osh-help-topics, language-oil-help-topics: HelpTopicsPlugin
    - any other class starting with 'language': PygmentsPlugin, and the
      original <pre><code> ... </code></pre> wrapper is not copied through.

    Args:
      s: HTML string.
      default_highlighter: name of the highlighter for <code> tags without a
        class attribute, or None.

    Returns:
      The HTML with the code blocks highlighted.

    Raises:
      RuntimeError: if a <code> tag without a class attribute is found and
        default_highlighter is neither None nor 'oil-sh'.
    """
    f = cStringIO.StringIO()
    out = html.Output(s, f)

    tag_lexer = html.TagLexer(s)

    pos = 0  # start offset of the current token

    # Advanced manually with next() because the branches below pull extra
    # tokens from the same iterator.
    it = html.ValidTokens(s)

    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break

        if tok_id == html.StartTag:

            tag_lexer.Reset(pos, end_pos)
            if tag_lexer.TagName() == 'pre':
                pre_start_pos = pos
                pos = end_pos

                # Look at the token right after <pre>.
                try:
                    tok_id, end_pos = next(it)
                except StopIteration:
                    break

                tag_lexer.Reset(pos, end_pos)
                if tok_id == html.StartTag and tag_lexer.TagName() == 'code':

                    css_class = tag_lexer.GetAttr('class')
                    code_start_pos = end_pos  # right after <code ...>

                    if css_class is None:
                        slash_code_left, slash_code_right = \
                            html.ReadUntilEndTag(it, tag_lexer, 'code')

                        if default_highlighter is not None:
                            if default_highlighter == 'oil-sh':
                                out.PrintUntil(code_start_pos)

                                # Using ShPromptPlugin because it does the
                                # comment highlighting we want!
                                plugin = ShPromptPlugin(
                                    s, code_start_pos, slash_code_left)
                                plugin.PrintHighlighted(out)

                                out.SkipTo(slash_code_left)
                            else:
                                raise RuntimeError(
                                    'Unknown default highlighter %r' % default_highlighter)

                    elif css_class.startswith('language'):
                        slash_code_left, slash_code_right = \
                            html.ReadUntilEndTag(it, tag_lexer, 'code')

                        if css_class == 'language-sh-prompt':
                            # Here we're KEEPING the original <pre><code>
                            # Print everything up to and including
                            # <pre><code language="...">
                            out.PrintUntil(code_start_pos)

                            plugin = ShPromptPlugin(s, code_start_pos, slash_code_left)
                            plugin.PrintHighlighted(out)

                            out.SkipTo(slash_code_left)

                        elif css_class == 'language-oil':
                            # TODO: Write an Oil syntax highlighter.
                            pass

                        elif css_class == 'language-osh-help-topics':
                            # TODO: Link to osh-help.html, instead of
                            # oil-help.html
                            out.PrintUntil(code_start_pos)

                            plugin = HelpTopicsPlugin(s, code_start_pos, slash_code_left, 'osh')
                            plugin.PrintHighlighted(out)

                            out.SkipTo(slash_code_left)

                        elif css_class == 'language-oil-help-topics':
                            out.PrintUntil(code_start_pos)

                            plugin = HelpTopicsPlugin(s, code_start_pos, slash_code_left, 'oil')
                            plugin.PrintHighlighted(out)

                            out.SkipTo(slash_code_left)

                        else:  # language-*: Use Pygments

                            # We REMOVE the original <pre><code> because
                            # Pygments gives you a <pre> already

                            # We just read closing </code>, and the next one
                            # should be </pre>.
                            try:
                                tok_id, end_pos = next(it)
                            except StopIteration:
                                break
                            # This token starts where </code> ended.
                            tag_lexer.Reset(slash_code_right, end_pos)
                            assert tok_id == html.EndTag, tok_id
                            assert tag_lexer.TagName(
                            ) == 'pre', tag_lexer.TagName()
                            slash_pre_right = end_pos

                            # Copy only up to the start of <pre>, so the
                            # wrapper itself is not printed.
                            out.PrintUntil(pre_start_pos)

                            # Language name is the class minus 'language-'.
                            lang = css_class[len('language-'):]
                            plugin = PygmentsPlugin(s, code_start_pos, slash_code_left, lang)
                            plugin.PrintHighlighted(out)

                            # Resume copying after </pre>.
                            out.SkipTo(slash_pre_right)
                            f.write('<!-- done pygments -->\n')

        # NOTE(review): in the branches that call ReadUntilEndTag() without
        # reading a further token, end_pos is still the end of <code ...>,
        # so pos lags behind the iterator until the next token is read --
        # confirm this is benign.
        pos = end_pos

    out.PrintTheRest()

    return f.getvalue()