Python ValidTokens示例，lazylex.html.ValidTokens Python示例

示例#1

0

显示文件

def IndexGroupToText(group_text):
  """Convert the HTML of one index group to text.

  Raw data tokens are copied to the output, and named character entities
  (e.g. &amp;) are replaced by their HTML_REFS entry.  Other token kinds are
  not written by the loop itself.

  Note: We could process some tags, like:

  - Blue Link (not clickable, but still useful)
  - Red X

  Args:
    group_text: HTML string for the group.

  Returns:
    The contents of the output buffer as a string.

  Raises:
    AssertionError: on a hex or decimal character reference; those are not
      handled yet.
  """
  sink = cStringIO.StringIO()
  out = html.Output(group_text, sink)

  prev_end = 0
  for tok_id, tok_end in html.ValidTokens(group_text):
    # A token spans from the end of the previous token to its own end.
    tok_start, prev_end = prev_end, tok_end

    if tok_id == html.RawData:
      out.SkipTo(tok_start)
      out.PrintUntil(tok_end)
      continue

    if tok_id == html.CharEntity:  # &amp;
      # Drop the first and last character of the token (the '&' and ';' of
      # something like &amp;) to get the lookup key.
      name = group_text[tok_start + 1:tok_end - 1]

      out.SkipTo(tok_start)
      out.Print(HTML_REFS[name])
      out.SkipTo(tok_end)
      continue

    # Not handling these yet
    if tok_id == html.HexChar:
      snippet = group_text[tok_start:tok_start + 20]
      raise AssertionError('Hex Char %r' % snippet)

    if tok_id == html.DecChar:
      snippet = group_text[tok_start:tok_start + 20]
      raise AssertionError('Dec Char %r' % snippet)

  out.PrintTheRest()
  return sink.getvalue()

示例#2

0

显示文件

文件： make_help.py 项目： sailfish009/oil

def ExtractBody(s):
    """Return the markup found between <body> and </body>.

    The splitter needs balanced tags, and what's in <head> isn't balanced.

    Args:
        s: HTML document as a string.

    Returns:
        The contents of the output buffer.  Nothing is printed by this
        function unless a <body> start tag is found.
    """
    buf = cStringIO.StringIO()
    out = html.Output(s, buf)
    tag_lexer = html.TagLexer(s)

    tok_start = 0
    tokens = html.ValidTokens(s)
    for tok_id, tok_end in tokens:
        if tok_id == html.StartTag:
            tag_lexer.Reset(tok_start, tok_end)
            if tag_lexer.TagName() == 'body':
                # Output begins right after <body> ...
                out.SkipTo(tok_end)

                # ... and stops at the left edge of </body>.  The same token
                # iterator is handed over so scanning continues from here.
                body_end_left, _ = html.ReadUntilEndTag(
                    tokens, tag_lexer, 'body')
                out.PrintUntil(body_end_left)
                break

        tok_start = tok_end

    return buf.getvalue()

示例#3

0

显示文件

文件： oil_doc.py 项目： asokoloski/oil

def ExpandLinks(s):
    """Expand shortcut hrefs such as $xref:bash in <a> tags.

    For every <a> start tag whose href matches _SHORTCUT_RE, the href value is
    replaced by the (HTML-escaped) result of the matching _ABBREVIATIONS
    function.  When the shortcut carries no argument, the text between <a> and
    </a> is used as the argument instead.

    Args:
        s: HTML document as a string.

    Returns:
        The contents of the output buffer after all hrefs were processed.

    Raises:
        RuntimeError: if a shortcut names an abbreviation that is not in
            _ABBREVIATIONS.
    """
    f = cStringIO.StringIO()
    out = html.Output(s, f)

    tag_lexer = html.TagLexer(s)

    # Start offset of the current token; must always equal the end of the
    # last token pulled from `it`, no matter who pulled it.
    pos = 0

    it = html.ValidTokens(s)
    for tok_id, end_pos in it:
        if tok_id == html.StartTag:
            tag_lexer.Reset(pos, end_pos)
            if tag_lexer.TagName() == 'a':
                open_tag_right = end_pos

                href_start, href_end = tag_lexer.GetSpanForAttrValue('href')
                # An <a> without href is simply left alone.  This is a nested
                # `if` rather than `continue` so that `pos` is still advanced
                # at the bottom of the loop.
                if href_start != -1:
                    # TODO: Need to unescape like GetAttr()
                    href = s[href_start:href_end]

                    new = None
                    m = _SHORTCUT_RE.match(href)
                    if m:
                        abbrev_name, arg = m.groups()
                        if not arg:
                            close_tag_left, close_tag_right = (
                                html.ReadUntilEndTag(it, tag_lexer, 'a'))
                            arg = s[open_tag_right:close_tag_left]
                            # Tokens through </a> were consumed from `it`, so
                            # the next token starts after the closing tag.
                            end_pos = close_tag_right

                        func = _ABBREVIATIONS.get(abbrev_name)
                        if not func:
                            raise RuntimeError('Invalid abbreviation %r' %
                                               abbrev_name)
                        new = func(arg)

                    if new is not None:
                        out.PrintUntil(href_start)
                        f.write(cgi.escape(new))
                        out.SkipTo(href_end)

        pos = end_pos

    out.PrintTheRest()

    return f.getvalue()

示例#4

0

显示文件

文件： make_help.py 项目： sailfish009/oil

def HelpTopics(s):
    """Given an HTML page, yield groups (id, desc, block of text).

    A group is an <h2 id="..."> heading whose content contains a '(', followed
    by a <code> element whose class is 'language-oil-help-topics' or
    'language-osh-help-topics'.

    Args:
        s: HTML page as a string.

    Yields:
        (group_topic_id, group_name, text) tuples, where group_topic_id is the
        id attribute of the <h2>, group_name is the stripped heading content
        before the first '(', and text is html.ToText() of the <code> span.

    Raises:
        AssertionError: if an <h2> has no id, its content has no '(', or the
            following <code> tag has an unexpected class.
    """
    tag_lexer = html.TagLexer(s)

    # Start offset of the current token; must always equal the end of the
    # last token pulled from `it`, no matter who pulled it.
    pos = 0
    it = html.ValidTokens(s)
    for tok_id, end_pos in it:
        if tok_id == html.StartTag:
            tag_lexer.Reset(pos, end_pos)

            # Capture <h2 id="foo"> first
            if tag_lexer.TagName() == 'h2':
                h2_start_right = end_pos

                group_topic_id = tag_lexer.GetAttr('id')
                assert group_topic_id, (
                    'Expected id= in %r' % tag_lexer.TagString())

                h2_end_left, _ = html.ReadUntilEndTag(it, tag_lexer, 'h2')

                anchor_html = s[h2_start_right:h2_end_left]
                paren_pos = anchor_html.find('(')
                assert paren_pos != -1, anchor_html

                group_name = anchor_html[:paren_pos].strip()

                # Now find the <code></code> span
                _, code_start_right = html.ReadUntilStartTag(
                    it, tag_lexer, 'code')
                css_class = tag_lexer.GetAttr('class')
                assert css_class in (
                    'language-oil-help-topics',
                    'language-osh-help-topics'), tag_lexer.TagString()

                code_end_left, code_end_right = html.ReadUntilEndTag(
                    it, tag_lexer, 'code')

                text = html.ToText(s, code_start_right, code_end_left)

                # Tokens through </code> were consumed from `it` above, so the
                # next token starts after the closing tag, not after <h2>.
                end_pos = code_end_right

                yield group_topic_id, group_name, text

        pos = end_pos

示例#5

0

显示文件

文件： oil_doc.py 项目： drkameleon/oil

def RemoveComments(s):
    """Remove <!-- comments --> from an HTML string.

    Comments whose text contains 'REPLACE' are kept, because they are
    placeholders that are substituted later.

    Args:
        s: HTML document as a string.

    Returns:
        The contents of the output buffer, i.e. s without the removed
        comments.
    """
    f = cStringIO.StringIO()
    out = html.Output(s, f)

    pos = 0
    for tok_id, end_pos in html.ValidTokens(s):
        # doc/release-index.md has <!-- REPLACE_WITH_DATE --> etc.
        if tok_id == html.Comment and 'REPLACE' not in s[pos:end_pos]:
            # Emit everything before the comment, then jump past it.
            out.PrintUntil(pos)
            out.SkipTo(end_pos)
        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()

示例#6

0

显示文件

文件： oil_doc.py 项目： drkameleon/oil

def HighlightCode(s, default_highlighter):
    """Highlight the contents of <pre><code> blocks in an HTML string.

    Algorithm:
    1. Collect what's inside <pre><code> ...
    2. Then read lines with ShPromptPlugin.
    3. If the line looks like a shell prompt and command, highlight them with
       <span>

    Only a <pre> start tag whose very next token is a <code> start tag is
    considered.  What happens next depends on the class attribute of <code>:

    - GetAttr('class') returns None: default_highlighter decides (see Args).
    - 'language-sh-prompt': highlighted with ShPromptPlugin.
    - 'language-oil': left as is (no highlighter yet).
    - 'language-osh-help-topics' / 'language-oil-help-topics': highlighted
      with HelpTopicsPlugin.
    - any other class starting with 'language': the whole <pre>...</pre> is
      replaced by PygmentsPlugin output followed by a
      '<!-- done pygments -->' marker.
    - any other class: left as is.

    Args:
        s: HTML document as a string.
        default_highlighter: highlighter for <code> tags where
            GetAttr('class') returns None.  None leaves such blocks as they
            are; 'oil-sh' uses ShPromptPlugin.

    Returns:
        The contents of the output buffer.

    Raises:
        RuntimeError: if default_highlighter is neither None nor 'oil-sh' and
            a <code> tag without a class is encountered.
        AssertionError: in the Pygments case, if the token after </code> is
            not a </pre> end tag.
    """
    f = cStringIO.StringIO()
    out = html.Output(s, f)

    tag_lexer = html.TagLexer(s)

    # Start offset of the token currently being examined.
    pos = 0

    it = html.ValidTokens(s)

    while True:
        try:
            tok_id, end_pos = next(it)
        except StopIteration:
            break

        if tok_id == html.StartTag:

            tag_lexer.Reset(pos, end_pos)
            if tag_lexer.TagName() == 'pre':
                pre_start_pos = pos
                pos = end_pos

                # Look at the token right after <pre>; only <code> matters.
                try:
                    tok_id, end_pos = next(it)
                except StopIteration:
                    break

                tag_lexer.Reset(pos, end_pos)
                if tok_id == html.StartTag and tag_lexer.TagName() == 'code':

                    css_class = tag_lexer.GetAttr('class')
                    code_start_pos = end_pos

                    # presumably None means <code> has no class attribute --
                    # TODO confirm against TagLexer.GetAttr
                    if css_class is None:
                        slash_code_left, slash_code_right = \
                            html.ReadUntilEndTag(it, tag_lexer, 'code')

                        if default_highlighter is not None:
                            if default_highlighter == 'oil-sh':
                                # Keep <pre><code> and replace what is inside.
                                out.PrintUntil(code_start_pos)

                                # Using ShPromptPlugin because it does the comment highlighting we want!
                                plugin = ShPromptPlugin(
                                    s, code_start_pos, slash_code_left)
                                plugin.PrintHighlighted(out)

                                out.SkipTo(slash_code_left)
                            else:
                                raise RuntimeError(
                                    'Unknown default highlighter %r' %
                                    default_highlighter)

                    elif css_class.startswith('language'):
                        slash_code_left, slash_code_right = \
                            html.ReadUntilEndTag(it, tag_lexer, 'code')

                        if css_class == 'language-sh-prompt':
                            # Here we're KEEPING the original <pre><code>
                            # Print everything up to and including <pre><code language="...">
                            out.PrintUntil(code_start_pos)

                            plugin = ShPromptPlugin(s, code_start_pos,
                                                    slash_code_left)
                            plugin.PrintHighlighted(out)

                            out.SkipTo(slash_code_left)

                        elif css_class == 'language-oil':
                            # TODO: Write an Oil syntax highlighter.
                            pass

                        elif css_class == 'language-osh-help-topics':
                            # TODO: Link to osh-help.html, instead of oil-help.html
                            out.PrintUntil(code_start_pos)

                            plugin = HelpTopicsPlugin(s, code_start_pos,
                                                      slash_code_left, 'osh')
                            plugin.PrintHighlighted(out)

                            out.SkipTo(slash_code_left)

                        elif css_class == 'language-oil-help-topics':

                            out.PrintUntil(code_start_pos)

                            plugin = HelpTopicsPlugin(s, code_start_pos,
                                                      slash_code_left, 'oil')
                            plugin.PrintHighlighted(out)

                            out.SkipTo(slash_code_left)

                        else:  # language-*: Use Pygments

                            # We REMOVE the original <pre><code> because Pygments gives you a <pre> already

                            # We just read closing </code>, and the next one should be </pre>.
                            try:
                                tok_id, end_pos = next(it)
                            except StopIteration:
                                break
                            tag_lexer.Reset(slash_code_right, end_pos)
                            assert tok_id == html.EndTag, tok_id
                            assert tag_lexer.TagName(
                            ) == 'pre', tag_lexer.TagName()
                            slash_pre_right = end_pos

                            # Output stops before <pre> and resumes after
                            # </pre>, so the original wrapper is dropped.
                            out.PrintUntil(pre_start_pos)

                            lang = css_class[len('language-'):]
                            plugin = PygmentsPlugin(s, code_start_pos,
                                                    slash_code_left, lang)
                            plugin.PrintHighlighted(out)

                            out.SkipTo(slash_pre_right)
                            f.write('<!-- done pygments -->\n')

        # NOTE(review): in the branches that call ReadUntilEndTag without
        # reading another token afterwards, end_pos still refers to the
        # <code> start tag while the iterator is already past </code>, so pos
        # may lag behind the iterator here -- confirm this is harmless.
        pos = end_pos

    out.PrintTheRest()

    return f.getvalue()