Пример #1
0
    def get_tokens(self, text):
        """Yield ``(tokentype, value)`` pairs from a raw token stream.

        *text* may be ``str`` (ASCII only) or ``bytes``; it is optionally
        gzip- or bz2-decompressed according to ``self.compress`` before
        being handed to ``get_tokens_unprocessed()``.
        """
        if isinstance(text, str):
            # raw token stream never has any non-ASCII characters
            text = text.encode('ascii')
        if self.compress == 'gz':
            import gzip
            # BUGFIX: the payload is bytes, so the file object must be
            # io.BytesIO -- io.StringIO only accepts str and would raise.
            gzipfile = gzip.GzipFile('', 'rb', 9, io.BytesIO(text))
            text = gzipfile.read()
        elif self.compress == 'bz2':
            import bz2
            text = bz2.decompress(text)

        # do not call Lexer.get_tokens() because we do not want Unicode
        # decoding to occur, and stripping is not optional.
        text = text.strip(b'\n') + b'\n'
        for i, t, v in self.get_tokens_unprocessed(text):
            yield t, v
Пример #2
0
    def get_tokens(self, text):
        """Yield ``(tokentype, value)`` pairs from a raw token stream.

        *text* may be ``str`` (ASCII only) or ``bytes``; it is optionally
        gzip- or bz2-decompressed according to ``self.compress`` before
        being handed to ``get_tokens_unprocessed()``.
        """
        if isinstance(text, str):
            # raw token stream never has any non-ASCII characters
            text = text.encode("ascii")
        if self.compress == "gz":
            import gzip

            # BUGFIX: the payload is bytes, so the file object must be
            # io.BytesIO -- io.StringIO only accepts str and would raise.
            gzipfile = gzip.GzipFile("", "rb", 9, io.BytesIO(text))
            text = gzipfile.read()
        elif self.compress == "bz2":
            import bz2

            text = bz2.decompress(text)

        # do not call Lexer.get_tokens() because we do not want Unicode
        # decoding to occur, and stripping is not optional.
        text = text.strip(b"\n") + b"\n"
        for i, t, v in self.get_tokens_unprocessed(text):
            yield t, v
Пример #3
0
    def format(self, tokensource, outfile):
        """Write ``tokensource`` to *outfile* as raw token lines.

        Each token becomes one tab-separated ``tokentype<TAB>repr(value)``
        line.  Output is optionally gzip- or bz2-compressed according to
        ``self.compress``; ``Token.Error`` values are colorized when
        ``self.error_color`` is set.

        Raises TypeError if *outfile* is not opened in binary mode.
        """
        try:
            # Probe the sink: writing empty bytes raises TypeError on a
            # text-mode file object.
            outfile.write(b'')
        except TypeError:
            raise TypeError('The raw tokens formatter needs a binary '
                            'output file')
        if self.compress == 'gz':
            import gzip
            outfile = gzip.GzipFile('', 'wb', 9, outfile)

            def write(text):
                outfile.write(text.encode())

            flush = outfile.flush
        elif self.compress == 'bz2':
            import bz2
            compressor = bz2.BZ2Compressor(9)

            def write(text):
                outfile.write(compressor.compress(text.encode()))

            def flush():
                # emit whatever the compressor is still buffering
                outfile.write(compressor.flush())
                outfile.flush()
        else:

            def write(text):
                outfile.write(text.encode())

            flush = outfile.flush

        # NOTE: removed unused locals `lasttype`/`lastval` -- they were
        # never read anywhere in this method.
        if self.error_color:
            for ttype, value in tokensource:
                line = "%s\t%r\n" % (ttype, value)
                if ttype is Token.Error:
                    write(colorize(self.error_color, line))
                else:
                    write(line)
        else:
            for ttype, value in tokensource:
                write("%s\t%r\n" % (ttype, value))
        flush()
Пример #4
0
 def get_tokens_unprocessed(self, text):
     """Parse raw tab-separated token lines from *text* (bytes).

     Yields ``(offset, tokentype, value)`` triples; lines that do not
     contain a tab are emitted verbatim as ``Error`` tokens.
     """
     length = 0
     for match in line_re.finditer(text):
         try:
             ttypestr, val = match.group().split(b"\t", 1)
         except ValueError:
             # Malformed line (no tab): emit it unchanged as an error.
             val = match.group().decode(self.encoding)
             ttype = Error
         else:
             ttype = _ttype_cache.get(ttypestr)
             if not ttype:
                 ttype = Token
                 # BUGFIX: ttypestr is bytes; it must be decoded before
                 # splitting on a str separator (bytes.split(str) raises
                 # TypeError, and bytes items have no .isupper()).
                 ttypes = ttypestr.decode("ascii").split(".")[1:]
                 for ttype_ in ttypes:
                     if not ttype_ or not ttype_[0].isupper():
                         raise ValueError("malformed token name")
                     ttype = getattr(ttype, ttype_)
                 _ttype_cache[ttypestr] = ttype
             # strip the repr quoting (e.g. u'...') and unescape
             val = val[2:-2].decode("unicode-escape")
         yield length, ttype, val
         length += len(val)
Пример #5
0
 def get_tokens_unprocessed(self, text):
     """Parse raw tab-separated token lines from *text* (bytes).

     Yields ``(offset, tokentype, value)`` triples; lines that do not
     contain a tab are emitted verbatim as ``Error`` tokens.
     """
     length = 0
     for match in line_re.finditer(text):
         try:
             ttypestr, val = match.group().split(b'\t', 1)
         except ValueError:
             # Malformed line (no tab): emit it unchanged as an error.
             val = match.group().decode(self.encoding)
             ttype = Error
         else:
             ttype = _ttype_cache.get(ttypestr)
             if not ttype:
                 ttype = Token
                 # BUGFIX: ttypestr is bytes; it must be decoded before
                 # splitting on a str separator (bytes.split(str) raises
                 # TypeError, and bytes items have no .isupper()).
                 ttypes = ttypestr.decode('ascii').split('.')[1:]
                 for ttype_ in ttypes:
                     if not ttype_ or not ttype_[0].isupper():
                         raise ValueError('malformed token name')
                     ttype = getattr(ttype, ttype_)
                 _ttype_cache[ttypestr] = ttype
             # strip the repr quoting (e.g. u'...') and unescape
             val = val[2:-2].decode('unicode-escape')
         yield length, ttype, val
         length += len(val)
Пример #6
0
    def format(self, tokensource, outfile):
        """Write ``tokensource`` to *outfile* as raw token lines.

        Each token becomes one tab-separated ``tokentype<TAB>repr(value)``
        line.  Output is optionally gzip- or bz2-compressed according to
        ``self.compress``; ``Token.Error`` values are colorized when
        ``self.error_color`` is set.

        Raises TypeError if *outfile* is not opened in binary mode.
        """
        try:
            # Probe the sink: writing empty bytes raises TypeError on a
            # text-mode file object.
            outfile.write(b'')
        except TypeError:
            raise TypeError('The raw tokens formatter needs a binary '
                            'output file')
        if self.compress == 'gz':
            import gzip
            outfile = gzip.GzipFile('', 'wb', 9, outfile)
            def write(text):
                outfile.write(text.encode())
            flush = outfile.flush
        elif self.compress == 'bz2':
            import bz2
            compressor = bz2.BZ2Compressor(9)
            def write(text):
                outfile.write(compressor.compress(text.encode()))
            def flush():
                # emit whatever the compressor is still buffering
                outfile.write(compressor.flush())
                outfile.flush()
        else:
            def write(text):
                outfile.write(text.encode())
            flush = outfile.flush

        # NOTE: removed unused locals `lasttype`/`lastval` -- they were
        # never read anywhere in this method.
        if self.error_color:
            for ttype, value in tokensource:
                line = "%s\t%r\n" % (ttype, value)
                if ttype is Token.Error:
                    write(colorize(self.error_color, line))
                else:
                    write(line)
        else:
            for ttype, value in tokensource:
                write("%s\t%r\n" % (ttype, value))
        flush()
Пример #7
0
    """
    "Null" lexer, doesn't highlight anything.
    """

    name = "Text only"
    aliases = ["text"]
    filenames = ["*.txt"]
    mimetypes = ["text/plain"]

    def get_tokens_unprocessed(self, text):
        """Emit the whole input unchanged as a single ``Text`` token at offset 0."""
        yield (0, Text, text)


_ttype_cache = {}

line_re = re.compile(b(".*?\n"))


class RawTokenLexer(Lexer):
    """
    Recreate a token stream formatted with the `RawTokenFormatter`.  This
    lexer raises exceptions during parsing if the token stream in the
    file is malformed.

    Additional options accepted:

    `compress`
        If set to ``"gz"`` or ``"bz2"``, decompress the token stream with
        the given compression algorithm before lexing (default: ``""``).
    """
Пример #8
0
class TextLexer(Lexer):
    """A pass-through ("null") lexer that applies no highlighting.

    The entire input document is emitted unchanged as one ``Text`` token.
    """

    # Registration metadata for lexer lookup.
    name = 'Text only'
    aliases = ['text']
    filenames = ['*.txt']
    mimetypes = ['text/plain']

    def get_tokens_unprocessed(self, text):
        # A single token covering the whole document, starting at offset 0.
        yield (0, Text, text)


_ttype_cache = {}

line_re = re.compile(b('.*?\n'))

class RawTokenLexer(Lexer):
    """
    Recreate a token stream formatted with the `RawTokenFormatter`.  This
    lexer raises exceptions during parsing if the token stream in the
    file is malformed.

    Additional options accepted:

    `compress`
        If set to ``"gz"`` or ``"bz2"``, decompress the token stream with
        the given compression algorithm before lexing (default: ``""``).
    """
    name = 'Raw token data'
    aliases = ['raw']