def test_several_lines_list():
    """tests list definition on several lines"""
    s = """['a'
    ]"""
    tokens = parse_source(s)
    assert tokens[:4] == [Token(P, LSQB, None), Token(P, STRING, "'a'"),
                          Token(P, RSQB, None), Token(P, NEWLINE, '')]
def test_S_first_set(self):
    p = self.parser
    LOW = p.tokens['LOW']
    CAP = p.tokens['CAP']
    for s in [Token(p, LOW, 'low'), p.EmptyToken, Token(p, CAP, 'cap')]:
        assert s in self.A.first_set
        assert s in self.B.first_set
        assert s in self.C.first_set
def test_numbers():
    """make sure all kinds of numbers are correctly parsed"""
    for number in NUMBERS:
        assert parse_source(number)[0] == Token(P, NUMBER, number)
        neg = '-%s' % number
        assert parse_source(neg)[:2] == [Token(P, MINUS, None),
                                         Token(P, NUMBER, number)]
    for number in BAD_NUMBERS:
        assert parse_source(number)[0] != Token(P, NUMBER, number)
def test_hex_number():
    """basic parse"""
    tokens = parse_source("a = 0x12L")
    assert listeq(tokens[:4], [Token(P, NAME, 'a'),
                               Token(P, EQUAL, None),
                               Token(P, NUMBER, '0x12L'),
                               Token(P, NEWLINE, '')])
def test_S_first_set(self):
    p = self.parser
    LOW = p.tokens['LOW']
    CAP = p.tokens['CAP']
    assert self.A.emptytoken_in_first_set
    assert self.B.emptytoken_in_first_set
    assert self.C.emptytoken_in_first_set
    for s in [Token(p, LOW, 'low'), Token(p, CAP, 'cap')]:
        assert self.A.match_first_set(s)
        assert self.B.match_first_set(s)
        assert self.C.match_first_set(s)
def test_numbers():
    """make sure all kinds of numbers are correctly parsed"""
    for number in NUMBERS:
        assert parse_source(number)[0].eq(Token(P, NUMBER, number))
        neg = '-%s' % number
        assert listeq(parse_source(neg)[:2],
                      [Token(P, MINUS, None), Token(P, NUMBER, number)])
    for number in BAD_NUMBERS:
        assert not parse_source(number)[0].eq(Token(P, NUMBER, number))
def parse_source(source):
    """returns list of parsed tokens"""
    lexer = Source(P, source.splitlines(True), {})
    tokens = []
    last_token = Token(P, NULLTOKEN, None)
    while last_token.codename != ENDMARKER:
        last_token = lexer.next()
        tokens.append(last_token)
    return tokens
def generate_tokens(parser, lines, flags, keywords):
    """
    This is a rewrite of pypy.module.parser.pytokenize.generate_tokens since
    the original function is not RPYTHON (uses yield).
    It was also slightly modified to generate Token instances instead
    of the original 5-tuples -- it's now a 4-tuple of

    * the Token instance
    * the whole line as a string
    * the line number (the real one, counting continuation lines)
    * the position on the line of the end of the token.

    Original docstring ::

        The generate_tokens() generator requires one argument, readline, which
        must be a callable object which provides the same interface as the
        readline() method of built-in file objects. Each call to the function
        should return one line of input as a string.

        The generator produces 5-tuples with these members: the token type; the
        token string; a 2-tuple (srow, scol) of ints specifying the row and
        column where the token begins in the source; a 2-tuple (erow, ecol) of
        ints specifying the row and column where the token ends in the source;
        and the line on which the token was found. The line passed is the
        logical line; continuation lines are included.
    """
    #for line in lines:
    #    print repr(line)
    #print '------------------- flags=%s ---->' % flags
    assert isinstance(parser, Parser)
    token_list = []
    lnum = parenlev = continued = 0
    namechars = NAMECHARS
    numchars = NUMCHARS
    contstr, needcont = '', 0
    contline = None
    indents = [0]
    last_comment = ''
    # make the annotator happy
    pos = -1
    lines.append('')  # XXX HACK probably not needed

    # look for the bom (byte-order marker) for utf-8
    # make the annotator happy
    endDFA = automata.DFA([], [])
    # make the annotator happy
    line = ''
    for line in lines:
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF while scanning triple-quoted string",
                                 line, (lnum-1, 0), token_list)
            endmatch = endDFA.recognize(line)
            if endmatch >= 0:
                pos = end = endmatch
                tok = Token(parser, parser.tokens['STRING'],
                            contstr + line[:end])
                token_list.append((tok, line, lnum, pos))
                last_comment = ''
                # token_list.append((STRING, contstr + line[:end],
                #                    strstart, (lnum, end), contline + line))
                contstr, needcont = '', 0
                contline = None
            elif (needcont and not line.endswith('\\\n') and
                  not line.endswith('\\\r\n')):
                tok = Token(parser, parser.tokens['ERRORTOKEN'],
                            contstr + line)
                token_list.append((tok, line, lnum, pos))
                last_comment = ''
                # token_list.append((ERRORTOKEN, contstr + line,
                #                    strstart, (lnum, len(line)), contline))
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column = column + 1
                elif line[pos] == '\t':
                    column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos = pos + 1
            if pos == max:
                break

            if line[pos] in '#\r\n':
                # skip comments or blank lines
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                tok = Token(parser, parser.tokens['INDENT'], line[:pos])
                token_list.append((tok, line, lnum, pos))
                last_comment = ''
            while column < indents[-1]:
                indents = indents[:-1]
                tok = Token(parser, parser.tokens['DEDENT'], '')
                token_list.append((tok, line, lnum, pos))
                last_comment = ''
            if column != indents[-1]:
                raise TokenIndentationError(
                    "unindent does not match any outer indentation level",
                    line, (lnum, 0), token_list)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", line,
                                 (lnum, 0), token_list)
            continued = 0

        while pos < max:
            pseudomatch = pseudoDFA.recognize(line, pos)
            if pseudomatch >= 0:               # scan for tokens
                # JDR: Modified
                start = whiteSpaceDFA.recognize(line, pos)
                if start < 0:
                    start = pos
                end = pseudomatch

                if start == end:
                    # Nothing matched!!!
                    raise TokenError("Unknown character", line,
                                     (lnum, start), token_list)

                pos = end
                token, initial = line[start:end], line[start]
                if initial in numchars or \
                   (initial == '.' and token != '.'):
                    # ordinary number
                    tok = Token(parser, parser.tokens['NUMBER'], token)
                    token_list.append((tok, line, lnum, pos))
                    last_comment = ''
                elif initial in '\r\n':
                    if parenlev <= 0:
                        tok = Token(parser, parser.tokens['NEWLINE'], token)
                        # XXX YUCK !
                        tok.value = last_comment
                        token_list.append((tok, line, lnum, pos))
                    last_comment = ''
                elif initial == '#':
                    # skip comment
                    last_comment = token
                elif token in triple_quoted:
                    endDFA = endDFAs[token]
                    endmatch = endDFA.recognize(line, pos)
                    if endmatch >= 0:          # all on one line
                        pos = endmatch
                        token = line[start:pos]
                        tok = Token(parser, parser.tokens['STRING'], token)
                        token_list.append((tok, line, lnum, pos))
                        last_comment = ''
                    else:
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                     token[:2] in single_quoted or \
                     token[:3] in single_quoted:
                    if token[-1] == '\n':      # continued string
                        endDFA = (endDFAs[initial] or endDFAs[token[1]] or
                                  endDFAs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                      # ordinary string
                        tok = Token(parser, parser.tokens['STRING'], token)
                        token_list.append((tok, line, lnum, pos))
                        last_comment = ''
                elif initial in namechars:     # ordinary name
                    tok = Token(parser, parser.tokens['NAME'], token)
                    if token not in keywords:
                        tok.isKeyword = False
                    token_list.append((tok, line, lnum, pos))
                    last_comment = ''
                elif initial == '\\':          # continued stmt
                    continued = 1
                    # lnum -= 1 disabled: count continuation lines separately
                else:
                    if initial in '([{':
                        parenlev = parenlev + 1
                    elif initial in ')]}':
                        parenlev = parenlev - 1
                        if parenlev < 0:
                            raise TokenError("unmatched '%s'" % initial, line,
                                             (lnum-1, 0), token_list)
                    if token in parser.tok_values:
                        punct = parser.tok_values[token]
                        tok = Token(parser, punct, None)
                    else:
                        tok = Token(parser, parser.tokens['OP'], token)
                    token_list.append((tok, line, lnum, pos))
                    last_comment = ''
            else:
                start = whiteSpaceDFA.recognize(line, pos)
                if start < 0:
                    start = pos
                if start < max and line[start] in single_quoted:
                    raise TokenError("EOL while scanning single-quoted string",
                                     line, (lnum, start), token_list)
                tok = Token(parser, parser.tokens['ERRORTOKEN'], line[pos])
                token_list.append((tok, line, lnum, pos))
                last_comment = ''
                pos = pos + 1

    lnum -= 1
    if not (flags & PyCF_DONT_IMPLY_DEDENT):
        if token_list and token_list[-1][0].codename != parser.tokens['NEWLINE']:
            token_list.append((Token(parser, parser.tokens['NEWLINE'], ''),
                               '\n', lnum, 0))
        for indent in indents[1:]:             # pop remaining indent levels
            tok = Token(parser, parser.tokens['DEDENT'], '')
            token_list.append((tok, line, lnum, pos))
    #if token_list and token_list[-1][0].codename != pytoken.NEWLINE:
    token_list.append((Token(parser, parser.tokens['NEWLINE'], ''),
                       '\n', lnum, 0))

    tok = Token(parser, parser.tokens['ENDMARKER'], '')
    token_list.append((tok, line, lnum, pos))
    #for t in token_list:
    #    print '%20s %-25s %d' % (pytoken.tok_name.get(t[0].codename, '?'), t[0], t[-2])
    #print '----------------------------------------- pyparser/pythonlexer.py'
    return token_list
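# --- Illustrative sketch (not part of the original lexer) ---
# The helper below restates, in isolation, the leading-whitespace rule used by
# the "new statement" branch of generate_tokens() above: spaces add one column,
# tabs round up to the next multiple of tabsize, and a form feed resets the
# column.  The function name `_measure_column` and the default tabsize of 8 are
# assumptions for illustration; the real module defines `tabsize` elsewhere.
def _measure_column(line, tabsize=8):
    """Return the indentation column of `line` (mirrors the loop above)."""
    column = 0
    for ch in line:
        if ch == ' ':
            column = column + 1
        elif ch == '\t':
            column = (column / tabsize + 1) * tabsize
        elif ch == '\f':
            column = 0
        else:
            break
    return column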
def generate_tokens(parser, lines, flags, keywords):
    """
    This is a rewrite of pypy.module.parser.pytokenize.generate_tokens since
    the original function is not RPYTHON (uses yield).
    It was also slightly modified to generate Token instances instead
    of the original 5-tuples -- it's now a 4-tuple of

    * the Token instance
    * the whole line as a string
    * the line number (the real one, counting continuation lines)
    * the position on the line of the end of the token.

    Original docstring ::

        The generate_tokens() generator requires one argument, readline, which
        must be a callable object which provides the same interface as the
        readline() method of built-in file objects. Each call to the function
        should return one line of input as a string.

        The generator produces 5-tuples with these members: the token type; the
        token string; a 2-tuple (srow, scol) of ints specifying the row and
        column where the token begins in the source; a 2-tuple (erow, ecol) of
        ints specifying the row and column where the token ends in the source;
        and the line on which the token was found. The line passed is the
        logical line; continuation lines are included.
    """
    #for line in lines:
    #    print repr(line)
    #print '------------------- flags=%s ---->' % flags
    assert isinstance(parser, Parser)
    token_list = []
    lnum = parenlev = continued = 0
    namechars = NAMECHARS
    numchars = NUMCHARS
    contstr, needcont = '', 0
    contline = None
    indents = [0]
    last_comment = ''
    # make the annotator happy
    pos = -1
    lines.append('')  # XXX HACK probably not needed

    # look for the bom (byte-order marker) for utf-8
    # make the annotator happy
    endDFA = automata.DFA([], [])
    # make the annotator happy
    line = ''
    for line in lines:
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF while scanning triple-quoted string",
                                 line, (lnum - 1, 0), token_list)
            endmatch = endDFA.recognize(line)
            if endmatch >= 0:
                pos = end = endmatch
                tok = Token(parser, parser.tokens['STRING'],
                            contstr + line[:end])
                token_list.append((tok, line, lnum, pos))
                last_comment = ''
                # token_list.append((STRING, contstr + line[:end],
                #                    strstart, (lnum, end), contline + line))
                contstr, needcont = '', 0
                contline = None
            elif (needcont and not line.endswith('\\\n') and
                  not line.endswith('\\\r\n')):
                tok = Token(parser, parser.tokens['ERRORTOKEN'],
                            contstr + line)
                token_list.append((tok, line, lnum, pos))
                last_comment = ''
                # token_list.append((ERRORTOKEN, contstr + line,
                #                    strstart, (lnum, len(line)), contline))
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column = column + 1
                elif line[pos] == '\t':
                    column = (column / tabsize + 1) * tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos = pos + 1
            if pos == max:
                break

            if line[pos] in '#\r\n':
                # skip comments or blank lines
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                tok = Token(parser, parser.tokens['INDENT'], line[:pos])
                token_list.append((tok, line, lnum, pos))
                last_comment = ''
            while column < indents[-1]:
                indents = indents[:-1]
                tok = Token(parser, parser.tokens['DEDENT'], '')
                token_list.append((tok, line, lnum, pos))
                last_comment = ''

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", line,
                                 (lnum, 0), token_list)
            continued = 0

        while pos < max:
            pseudomatch = pseudoDFA.recognize(line, pos)
            if pseudomatch >= 0:               # scan for tokens
                # JDR: Modified
                start = whiteSpaceDFA.recognize(line, pos)
                if start < 0:
                    start = pos
                end = pseudomatch

                if start == end:
                    # Nothing matched!!!
                    raise TokenError("Unknown character", line,
                                     (lnum, start), token_list)

                pos = end
                token, initial = line[start:end], line[start]
                if initial in numchars or \
                   (initial == '.' and token != '.'):
                    # ordinary number
                    tok = Token(parser, parser.tokens['NUMBER'], token)
                    token_list.append((tok, line, lnum, pos))
                    last_comment = ''
                elif initial in '\r\n':
                    if parenlev <= 0:
                        tok = Token(parser, parser.tokens['NEWLINE'], token)
                        # XXX YUCK !
                        tok.value = last_comment
                        token_list.append((tok, line, lnum, pos))
                    last_comment = ''
                elif initial == '#':
                    # skip comment
                    last_comment = token
                elif token in triple_quoted:
                    endDFA = endDFAs[token]
                    endmatch = endDFA.recognize(line, pos)
                    if endmatch >= 0:          # all on one line
                        pos = endmatch
                        token = line[start:pos]
                        tok = Token(parser, parser.tokens['STRING'], token)
                        token_list.append((tok, line, lnum, pos))
                        last_comment = ''
                    else:
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                     token[:2] in single_quoted or \
                     token[:3] in single_quoted:
                    if token[-1] == '\n':      # continued string
                        endDFA = (endDFAs[initial] or endDFAs[token[1]] or
                                  endDFAs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                      # ordinary string
                        tok = Token(parser, parser.tokens['STRING'], token)
                        token_list.append((tok, line, lnum, pos))
                        last_comment = ''
                elif initial in namechars:     # ordinary name
                    tok = Token(parser, parser.tokens['NAME'], token)
                    if token not in keywords:
                        tok.isKeyword = False
                    token_list.append((tok, line, lnum, pos))
                    last_comment = ''
                elif initial == '\\':          # continued stmt
                    continued = 1
                    # lnum -= 1 disabled: count continuation lines separately
                else:
                    if initial in '([{':
                        parenlev = parenlev + 1
                    elif initial in ')]}':
                        parenlev = parenlev - 1
                        if parenlev < 0:
                            raise TokenError("unmatched '%s'" % initial, line,
                                             (lnum - 1, 0), token_list)
                    if token in parser.tok_values:
                        punct = parser.tok_values[token]
                        tok = Token(parser, punct, None)
                    else:
                        tok = Token(parser, parser.tokens['OP'], token)
                    token_list.append((tok, line, lnum, pos))
                    last_comment = ''
            else:
                start = whiteSpaceDFA.recognize(line, pos)
                if start < 0:
                    start = pos
                if start < max and line[start] in single_quoted:
                    raise TokenError("EOL while scanning single-quoted string",
                                     line, (lnum, start), token_list)
                tok = Token(parser, parser.tokens['ERRORTOKEN'], line[pos])
                token_list.append((tok, line, lnum, pos))
                last_comment = ''
                pos = pos + 1

    lnum -= 1
    if not (flags & PyCF_DONT_IMPLY_DEDENT):
        if token_list and token_list[-1][0].codename != parser.tokens['NEWLINE']:
            token_list.append((Token(parser, parser.tokens['NEWLINE'], ''),
                               '\n', lnum, 0))
        for indent in indents[1:]:             # pop remaining indent levels
            tok = Token(parser, parser.tokens['DEDENT'], '')
            token_list.append((tok, line, lnum, pos))
    #if token_list and token_list[-1][0].codename != pytoken.NEWLINE:
    token_list.append((Token(parser, parser.tokens['NEWLINE'], ''),
                       '\n', lnum, 0))

    tok = Token(parser, parser.tokens['ENDMARKER'], '')
    token_list.append((tok, line, lnum, pos))
    #for t in token_list:
    #    print '%20s %-25s %d' % (pytoken.tok_name.get(t[0].codename, '?'), t[0], t[-2])
    #print '----------------------------------------- pyparser/pythonlexer.py'
    return token_list
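# --- Illustrative usage sketch (not part of the original module) ---
# One possible way to consume the 4-tuples documented in generate_tokens()
# above.  The helper name `dump_token_list` and the flags value 0 (which keeps
# the implied trailing NEWLINE/DEDENT behaviour) are assumptions for
# illustration; Token.codename and Token.value are the attributes the lexer
# itself sets on each token.
def dump_token_list(parser, source, keywords):
    """Print each token with the line number and end column it was found at."""
    lines = source.splitlines(True)
    for tok, line, lnum, pos in generate_tokens(parser, lines, 0, keywords):
        print '%-10s %-20r line=%d end=%d' % (tok.codename, tok.value,
                                              lnum, pos)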