Example #1
def test_several_lines_list():
    """tests list definition on several lines"""
    s = """['a'
    ]"""
    tokens = parse_source(s)
    assert tokens[:4] == [Token(P, LSQB, None), Token(P, STRING, "'a'"),
                          Token(P, RSQB, None), Token(P, NEWLINE, '')]
Example #2
def test_S_first_set(self):
    p = self.parser
    LOW = p.tokens['LOW']
    CAP = p.tokens['CAP']
    for s in [Token(p, LOW, 'low'), p.EmptyToken, Token(p, CAP, 'cap')]:
        assert s in self.A.first_set
        assert s in self.B.first_set
        assert s in self.C.first_set
Example #3
def test_numbers():
    """make sure all kind of numbers are correctly parsed"""
    for number in NUMBERS:
        assert parse_source(number)[0] == Token(P, NUMBER, number)
        neg = '-%s' % number
        assert parse_source(neg)[:2] == [Token(P, MINUS, None), 
                                         Token(P, NUMBER, number)]
    for number in BAD_NUMBERS:
        assert parse_source(number)[0] != Token(P, NUMBER, number)
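The NUMBERS and BAD_NUMBERS constants are defined elsewhere in the test module and are not part of this listing. Purely as an illustration (hypothetical contents, not the original lists), they could look like this:

# Hypothetical contents for illustration only -- the real lists live in the
# test module this example was taken from (Python 2-era literals).
NUMBERS = ['1', '1.23', '1e5', '0x12L', '1j']   # should lex to a single NUMBER token
BAD_NUMBERS = ['j', '0xg', '0xj']               # should not lex to that exact NUMBER token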
Example #4
def test_hex_number():
    """basic pasrse"""
    tokens = parse_source("a = 0x12L")
    assert listeq(tokens[:4], [
        Token(P, NAME, 'a'),
        Token(P, EQUAL, None),
        Token(P, NUMBER, '0x12L'),
        Token(P, NEWLINE, '')
    ])
Example #5
def test_S_first_set(self):
    p = self.parser
    LOW = p.tokens['LOW']
    CAP = p.tokens['CAP']
    assert self.A.emptytoken_in_first_set
    assert self.B.emptytoken_in_first_set
    assert self.C.emptytoken_in_first_set
    for s in [Token(p, LOW, 'low'), Token(p, CAP, 'cap')]:
        assert self.A.match_first_set(s)
        assert self.B.match_first_set(s)
        assert self.C.match_first_set(s)
Example #6
def test_numbers():
    """make sure all kind of numbers are correctly parsed"""
    for number in NUMBERS:
        assert parse_source(number)[0].eq(Token(P, NUMBER, number))
        neg = '-%s' % number
        assert listeq(
            parse_source(neg)[:2],
            [Token(P, MINUS, None),
             Token(P, NUMBER, number)])
    for number in BAD_NUMBERS:
        assert not parse_source(number)[0].eq(Token(P, NUMBER, number))
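The listeq helper used in Examples #4 and #6 is also defined outside these snippets. A minimal sketch of what it plausibly does, assuming a simple element-wise comparison of two token sequences (an illustration, not the original implementation):

def listeq(lst1, lst2):
    """Hypothetical sketch: element-wise comparison of two token lists."""
    if len(lst1) != len(lst2):
        return False
    for a, b in zip(lst1, lst2):
        # depending on the revision, tokens compare via == (Example #3)
        # or via an explicit .eq() method (Example #6)
        if not a == b:
            return False
    return True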
Example #7
def parse_source(source):
    """returns a list of parsed tokens"""
    lexer = Source(P, source.splitlines(True), {})
    tokens = []
    last_token = Token(P, NULLTOKEN, None)
    while last_token.codename != ENDMARKER:
        last_token = lexer.next()
        tokens.append(last_token)
    return tokens
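A short usage sketch for parse_source, reusing the module-level P parser and token constants assumed by the tests above; whether tokens compare via == or .eq() depends on the Token API revision in use:

# Illustration only: tokenize a one-line assignment and inspect the result.
tokens = parse_source("a = 1\n")
assert tokens[0] == Token(P, NAME, 'a')      # NAME 'a'
assert tokens[1] == Token(P, EQUAL, None)    # '='
assert tokens[2] == Token(P, NUMBER, '1')    # NUMBER '1'
assert tokens[-1].codename == ENDMARKER      # parse_source always ends here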
Example #8
def generate_tokens(parser, lines, flags, keywords):
    """
    This is a rewrite of pypy.module.parser.pytokenize.generate_tokens since
    the original function is not RPYTHON (uses yield)
    It was also slightly modified to generate Token instances instead
    of the original 5-tuples -- it's now a 4-tuple of
    
    * the Token instance
    * the whole line as a string
    * the line number (the real one, counting continuation lines)
    * the position on the line of the end of the token.

    Original docstring ::
    
        The generate_tokens() generator requires one argument, readline, which
        must be a callable object which provides the same interface as the
        readline() method of built-in file objects. Each call to the function
        should return one line of input as a string.

        The generator produces 5-tuples with these members: the token type; the
        token string; a 2-tuple (srow, scol) of ints specifying the row and
        column where the token begins in the source; a 2-tuple (erow, ecol) of
        ints specifying the row and column where the token ends in the source;
        and the line on which the token was found. The line passed is the
        logical line; continuation lines are included.
    """
    #for line in lines:
    #    print repr(line)
    #print '------------------- flags=%s ---->' % flags
    assert isinstance(parser, Parser)
    token_list = []
    lnum = parenlev = continued = 0
    namechars = NAMECHARS
    numchars = NUMCHARS
    contstr, needcont = '', 0
    contline = None
    indents = [0]
    last_comment = ''
    # make the annotator happy
    pos = -1
    lines.append('') # XXX HACK probably not needed

    # look for the bom (byte-order marker) for utf-8

    # make the annotator happy
    endDFA = automata.DFA([], [])
    # make the annotator happy
    line = ''
    for line in lines:
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF while scanning triple-quoted string", line,
                                 (lnum-1, 0), token_list)
            endmatch = endDFA.recognize(line)
            if endmatch >= 0:
                pos = end = endmatch
                tok = Token(parser, parser.tokens['STRING'], contstr + line[:end])
                token_list.append((tok, line, lnum, pos))
                last_comment = ''
                # token_list.append((STRING, contstr + line[:end],
                #                    strstart, (lnum, end), contline + line))
                contstr, needcont = '', 0
                contline = None
            elif (needcont and not line.endswith('\\\n') and
                               not line.endswith('\\\r\n')):
                tok = Token(parser, parser.tokens['ERRORTOKEN'], contstr + line)
                token_list.append((tok, line, lnum, pos))
                last_comment = ''
                # token_list.append((ERRORTOKEN, contstr + line,
                #                    strstart, (lnum, len(line)), contline))
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':
                # skip comments or blank lines
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                tok = Token(parser, parser.tokens['INDENT'], line[:pos])
                token_list.append((tok, line, lnum, pos))
                last_comment = ''
            while column < indents[-1]:
                indents = indents[:-1]
                tok = Token(parser, parser.tokens['DEDENT'], '')
                token_list.append((tok, line, lnum, pos))
                last_comment = ''
            if column != indents[-1]:
                raise TokenIndentationError("unindent does not match any outer indentation level",
                                 line, (lnum, 0), token_list)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", line,
                                 (lnum, 0), token_list)
            continued = 0

        while pos < max:
            pseudomatch = pseudoDFA.recognize(line, pos)
            if pseudomatch >= 0:                            # scan for tokens
                # JDR: Modified
                start = whiteSpaceDFA.recognize(line, pos)
                if start < 0:
                    start = pos
                end = pseudomatch

                if start == end:
                    # Nothing matched!!!
                    raise TokenError("Unknown character", line,
                                 (lnum, start), token_list)

                pos = end
                token, initial = line[start:end], line[start]
                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    tok = Token(parser, parser.tokens['NUMBER'], token)
                    token_list.append((tok, line, lnum, pos))
                    last_comment = ''
                elif initial in '\r\n':
                    if parenlev <= 0:
                        tok = Token(parser, parser.tokens['NEWLINE'], token)
                        # XXX YUCK !
                        tok.value = last_comment
                        token_list.append((tok, line, lnum, pos))
                    last_comment = ''
                elif initial == '#':
                    # skip comment
                    last_comment = token
                elif token in triple_quoted:
                    endDFA = endDFAs[token]
                    endmatch = endDFA.recognize(line, pos)
                    if endmatch >= 0:                     # all on one line
                        pos = endmatch
                        token = line[start:pos]
                        tok = Token(parser, parser.tokens['STRING'], token)
                        token_list.append((tok, line, lnum, pos))
                        last_comment = ''
                    else:
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        endDFA = (endDFAs[initial] or endDFAs[token[1]] or
                                   endDFAs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        tok = Token(parser, parser.tokens['STRING'], token)
                        token_list.append((tok, line, lnum, pos))
                        last_comment = ''
                elif initial in namechars:                 # ordinary name
                    tok = Token(parser, parser.tokens['NAME'], token)
                    if token not in keywords:
                        tok.isKeyword = False
                    token_list.append((tok, line, lnum, pos))
                    last_comment = ''
                elif initial == '\\':                      # continued stmt
                    continued = 1
                    # lnum -= 1  disabled: count continuation lines separately
                else:
                    if initial in '([{':
                        parenlev = parenlev + 1
                    elif initial in ')]}':
                        parenlev = parenlev - 1
                        if parenlev < 0:
                            raise TokenError("unmatched '%s'" % initial, line,
                                             (lnum-1, 0), token_list)
                    if token in parser.tok_values:
                        punct = parser.tok_values[token]
                        tok = Token(parser, punct, None)
                    else:
                        tok = Token(parser, parser.tokens['OP'], token)
                    token_list.append((tok, line, lnum, pos)) 
                    last_comment = ''
            else:
                start = whiteSpaceDFA.recognize(line, pos)
                if start < 0:
                    start = pos
                if start<max and line[start] in single_quoted:
                    raise TokenError("EOL while scanning single-quoted string", line,
                             (lnum, start), token_list)
                tok = Token(parser, parser.tokens['ERRORTOKEN'], line[pos])
                token_list.append((tok, line, lnum, pos))
                last_comment = ''
                pos = pos + 1

    lnum -= 1
    if not (flags & PyCF_DONT_IMPLY_DEDENT):
        if token_list and token_list[-1][0].codename != parser.tokens['NEWLINE']:
            token_list.append((Token(parser, parser.tokens['NEWLINE'], ''), '\n', lnum, 0))
        for indent in indents[1:]:                # pop remaining indent levels
            tok = Token(parser, parser.tokens['DEDENT'], '')
            token_list.append((tok, line, lnum, pos))
    #if token_list and token_list[-1][0].codename != pytoken.NEWLINE:
    token_list.append((Token(parser, parser.tokens['NEWLINE'], ''), '\n', lnum, 0))

    tok = Token(parser, parser.tokens['ENDMARKER'], '',)
    token_list.append((tok, line, lnum, pos))
    #for t in token_list:
    #    print '%20s  %-25s %d' % (pytoken.tok_name.get(t[0].codename, '?'), t[0], t[-2])
    #print '----------------------------------------- pyparser/pythonlexer.py'
    return token_list
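A hedged consumption sketch for generate_tokens, mirroring the commented-out debug loop at the end of the function: `parser` and `keywords` are assumed to come from the surrounding pyparser setup, and flags=0 means the trailing NEWLINE/DEDENT tokens are implied. Each entry is the 4-tuple described in the docstring.

# Illustration only (assumes a configured `parser` and its `keywords` set).
source_lines = ["if x:\n", "    y = 1\n"]
for tok, line, lnum, end_pos in generate_tokens(parser, source_lines, 0, keywords):
    # (Token instance, whole line, line number, end position on the line)
    print '%-25s %r  line=%d end=%d' % (tok, line, lnum, end_pos)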
Example #9
def generate_tokens(parser, lines, flags, keywords):
    """
    This is a rewrite of pypy.module.parser.pytokenize.generate_tokens since
    the original function is not RPYTHON (uses yield)
    It was also slightly modified to generate Token instances instead
    of the original 5-tuples -- it's now a 4-tuple of
    
    * the Token instance
    * the whole line as a string
    * the line number (the real one, counting continuation lines)
    * the position on the line of the end of the token.

    Original docstring ::
    
        The generate_tokens() generator requires one argument, readline, which
        must be a callable object which provides the same interface as the
        readline() method of built-in file objects. Each call to the function
        should return one line of input as a string.

        The generator produces 5-tuples with these members: the token type; the
        token string; a 2-tuple (srow, scol) of ints specifying the row and
        column where the token begins in the source; a 2-tuple (erow, ecol) of
        ints specifying the row and column where the token ends in the source;
        and the line on which the token was found. The line passed is the
        logical line; continuation lines are included.
    """
    #for line in lines:
    #    print repr(line)
    #print '------------------- flags=%s ---->' % flags
    assert isinstance(parser, Parser)
    token_list = []
    lnum = parenlev = continued = 0
    namechars = NAMECHARS
    numchars = NUMCHARS
    contstr, needcont = '', 0
    contline = None
    indents = [0]
    last_comment = ''
    # make the annotator happy
    pos = -1
    lines.append('')  # XXX HACK probably not needed

    # look for the bom (byte-order marker) for utf-8

    # make the annotator happy
    endDFA = automata.DFA([], [])
    # make the annotator happy
    line = ''
    for line in lines:
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:  # continued string
            if not line:
                raise TokenError("EOF while scanning triple-quoted string",
                                 line, (lnum - 1, 0), token_list)
            endmatch = endDFA.recognize(line)
            if endmatch >= 0:
                pos = end = endmatch
                tok = Token(parser, parser.tokens['STRING'],
                            contstr + line[:end])
                token_list.append((tok, line, lnum, pos))
                last_comment = ''
                # token_list.append((STRING, contstr + line[:end],
                #                    strstart, (lnum, end), contline + line))
                contstr, needcont = '', 0
                contline = None
            elif (needcont and not line.endswith('\\\n')
                  and not line.endswith('\\\r\n')):
                tok = Token(parser, parser.tokens['ERRORTOKEN'],
                            contstr + line)
                token_list.append((tok, line, lnum, pos))
                last_comment = ''
                # token_list.append((ERRORTOKEN, contstr + line,
                #                    strstart, (lnum, len(line)), contline))
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:  # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t':
                    column = (column / tabsize + 1) * tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':
                # skip comments or blank lines
                continue

            if column > indents[-1]:  # count indents or dedents
                indents.append(column)
                tok = Token(parser, parser.tokens['INDENT'], line[:pos])
                token_list.append((tok, line, lnum, pos))
                last_comment = ''
            while column < indents[-1]:
                indents = indents[:-1]
                tok = Token(parser, parser.tokens['DEDENT'], '')
                token_list.append((tok, line, lnum, pos))
                last_comment = ''
        else:  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", line,
                                 (lnum, 0), token_list)
            continued = 0

        while pos < max:
            pseudomatch = pseudoDFA.recognize(line, pos)
            if pseudomatch >= 0:  # scan for tokens
                # JDR: Modified
                start = whiteSpaceDFA.recognize(line, pos)
                if start < 0:
                    start = pos
                end = pseudomatch

                if start == end:
                    # Nothing matched!!!
                    raise TokenError("Unknown character", line, (lnum, start),
                                     token_list)

                pos = end
                token, initial = line[start:end], line[start]
                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    tok = Token(parser, parser.tokens['NUMBER'], token)
                    token_list.append((tok, line, lnum, pos))
                    last_comment = ''
                elif initial in '\r\n':
                    if parenlev <= 0:
                        tok = Token(parser, parser.tokens['NEWLINE'], token)
                        # XXX YUCK !
                        tok.value = last_comment
                        token_list.append((tok, line, lnum, pos))
                    last_comment = ''
                elif initial == '#':
                    # skip comment
                    last_comment = token
                elif token in triple_quoted:
                    endDFA = endDFAs[token]
                    endmatch = endDFA.recognize(line, pos)
                    if endmatch >= 0:  # all on one line
                        pos = endmatch
                        token = line[start:pos]
                        tok = Token(parser, parser.tokens['STRING'], token)
                        token_list.append((tok, line, lnum, pos))
                        last_comment = ''
                    else:
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':  # continued string
                        endDFA = (endDFAs[initial] or endDFAs[token[1]]
                                  or endDFAs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:  # ordinary string
                        tok = Token(parser, parser.tokens['STRING'], token)
                        token_list.append((tok, line, lnum, pos))
                        last_comment = ''
                elif initial in namechars:  # ordinary name
                    tok = Token(parser, parser.tokens['NAME'], token)
                    if token not in keywords:
                        tok.isKeyword = False
                    token_list.append((tok, line, lnum, pos))
                    last_comment = ''
                elif initial == '\\':  # continued stmt
                    continued = 1
                    # lnum -= 1  disabled: count continuation lines separately
                else:
                    if initial in '([{':
                        parenlev = parenlev + 1
                    elif initial in ')]}':
                        parenlev = parenlev - 1
                        if parenlev < 0:
                            raise TokenError("unmatched '%s'" % initial, line,
                                             (lnum - 1, 0), token_list)
                    if token in parser.tok_values:
                        punct = parser.tok_values[token]
                        tok = Token(parser, punct, None)
                    else:
                        tok = Token(parser, parser.tokens['OP'], token)
                    token_list.append((tok, line, lnum, pos))
                    last_comment = ''
            else:
                start = whiteSpaceDFA.recognize(line, pos)
                if start < 0:
                    start = pos
                if start < max and line[start] in single_quoted:
                    raise TokenError("EOL while scanning single-quoted string",
                                     line, (lnum, start), token_list)
                tok = Token(parser, parser.tokens['ERRORTOKEN'], line[pos])
                token_list.append((tok, line, lnum, pos))
                last_comment = ''
                pos = pos + 1

    lnum -= 1
    if not (flags & PyCF_DONT_IMPLY_DEDENT):
        if token_list and token_list[-1][0].codename != parser.tokens[
                'NEWLINE']:
            token_list.append((Token(parser, parser.tokens['NEWLINE'],
                                     ''), '\n', lnum, 0))
        for indent in indents[1:]:  # pop remaining indent levels
            tok = Token(parser, parser.tokens['DEDENT'], '')
            token_list.append((tok, line, lnum, pos))
    #if token_list and token_list[-1][0].codename != pytoken.NEWLINE:
    token_list.append((Token(parser, parser.tokens['NEWLINE'],
                             ''), '\n', lnum, 0))

    tok = Token(
        parser,
        parser.tokens['ENDMARKER'],
        '',
    )
    token_list.append((tok, line, lnum, pos))
    #for t in token_list:
    #    print '%20s  %-25s %d' % (pytoken.tok_name.get(t[0].codename, '?'), t[0], t[-2])
    #print '----------------------------------------- pyparser/pythonlexer.py'
    return token_list