-
Notifications
You must be signed in to change notification settings - Fork 0
/
Lexer.py
executable file
·154 lines (122 loc) · 3.59 KB
/
Lexer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import genericScanner as Scanner
from genericToken import *
from Symbols import *
from genericCharacter import *
class LexerError(Exception):
    """Raised when the lexer encounters input it cannot tokenize."""
def dq(s):
    """Return s wrapped in double quotes, e.g. abc -> "abc"."""
    return '"' + s + '"'
def initialize(sourceText):
    """
    Prepare the lexer to tokenize sourceText.

    Hands the text to the scanner and primes the one/two character
    lookahead globals (c1, c2, character) by fetching the first character.
    """
    Scanner.initialize(sourceText)
    getChar()
def get():
    """
    Construct and return the next token in the source text.

    Skips over material the parser does not need ('#' line comments,
    whitespace runs, and /* ... */ block comments), then classifies the
    next lexeme as EOF, IDENTIFIER (or keyword), NUMBER, STRING, a
    two-character symbol, a three-character symbol ('<<<', '<->', '>>>'),
    or a one-character symbol.

    Raises (via token.abort) on end-of-file inside a block comment or
    string literal, and on any unrecognized character.
    """
    # ------------------------------------------------------------------
    # Skip tokens the parser does not need.  Loop until none of the three
    # skippable forms is at the cursor.
    # ------------------------------------------------------------------
    while c1 == '#' or c1 in WHITESPACE_CHARS or c2 == "/*":

        # '#' starts a comment that runs to the end of the line
        while c1 == '#':
            while c1 != LINE_MARK:
                getChar()
            getChar()  # consume the newline itself

        # a run of whitespace characters
        while c1 in WHITESPACE_CHARS:
            token = Token(character)
            token.type = WHITESPACE
            getChar()
            while c1 in WHITESPACE_CHARS:
                token.cargo += c1
                getChar()

        # a /* ... */ block comment
        while c2 == "/*":
            token = Token(character)
            token.type = COMMENT
            token.cargo = c2
            getChar()  # read past the first character of the "/*" pair
            getChar()  # read past the second character of the "/*" pair
            while not (c2 == "*/"):
                if c1 == ENDMARK:
                    token.abort("Found end of file before end of comment")
                token.cargo += c1
                getChar()
            token.cargo += c2  # append the */ to the token cargo
            getChar()  # read past the first character of the "*/" pair
            getChar()  # read past the second character of the "*/" pair

    # The cursor now sits on the start of a real token.
    token = Token(character)

    if c1 == ENDMARK:
        token.type = EOF
        return token

    if c1 in IDENTIFIER_STARTCHARS:
        token.type = IDENTIFIER
        getChar()
        while c1 in IDENTIFIER_CHARS:
            token.cargo += c1
            getChar()
        # a keyword lexes like an identifier but gets its own token type
        if token.cargo in Keywords:
            token.type = token.cargo
        return token

    if c1 in NUMBER_STARTCHARS:
        token.type = NUMBER
        getChar()
        while c1 in NUMBER_CHARS:
            token.cargo += c1
            getChar()
        return token

    if c1 in STRING_STARTCHARS:
        # remember the quoteChar (single or double quote)
        # so we can look for the same character to terminate the quote.
        quoteChar = c1
        getChar()
        while c1 != quoteChar:
            if c1 == ENDMARK:
                token.abort("Found end of file before end of string literal")
            token.cargo += c1  # append quoted character to text
            getChar()
        token.cargo += c1  # append close quote to text
        getChar()
        token.type = STRING
        return token

    if c2 in TwoCharacterSymbols:
        token.cargo = c2
        token.type = token.cargo  # for symbols, the token type is same as the cargo
        getChar()  # read past the first character of a 2-character token
        getChar()  # read past the second character of a 2-character token
        return token

    if c1 in OneCharacterSymbols:
        # '<' may begin the three-character symbols '<<<' or '<->'
        if c1 == '<':
            getChar()  # consume the '<'
            if c2 == '<<':
                token.cargo = '<<<'
                token.type = token.cargo
                getChar()
                getChar()
                return token
            if c2 == '->':
                token.cargo = '<->'
                token.type = token.cargo
                getChar()
                getChar()
                return token
            # BUG FIX: the '<' was already consumed above; the original
            # fell through and called getChar() again, silently dropping
            # the character that follows a lone '<'.
            token.type = token.cargo
            return token
        # '>' may begin the three-character symbol '>>>'
        if c1 == '>':
            getChar()  # consume the '>'
            if c2 == '>>':
                token.cargo = '>>>'
                token.type = token.cargo
                getChar()
                getChar()
                return token
            # BUG FIX: same as '<' above — do not consume a second char
            # for a lone '>'.
            token.type = token.cargo
            return token
        token.type = token.cargo  # for symbols, the token type is same as the cargo
        getChar()  # read past the symbol
        return token

    # else.... We have encountered something that we don't recognize.
    token.abort("I found a character or symbol that I do not recognize: " + dq(c1))
def getChar():
    """
    Advance to the next character from the scanner.

    Updates the module-level lookahead state:
      character -- the Character object just read from the scanner
      c1        -- its cargo (the character itself, a 1-char string)
      c2        -- c1 plus a one-character lookahead (a 2-char string)
    """
    global c1, c2, character
    character = Scanner.get()
    c1 = character.cargo
    #---------------------------------------------------------------
    # Every time we get a character from the scanner, we also
    # lookahead to the next character and save the results in c2.
    # This makes it easy to lookahead 2 characters.
    #---------------------------------------------------------------
    c2 = c1 + Scanner.lookahead(1)