-
Notifications
You must be signed in to change notification settings - Fork 0
/
Tokenizer.py
108 lines (101 loc) · 2.49 KB
/
Tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
from Token import Token
# Maps every reserved word and punctuation mark of the Jack-like language to
# its token category.  Multi-character operators are looked up through the
# placeholder words produced by `long_operators` below.
token_type = {
    'class': 'keyword',
    'constructor': 'keyword',
    'function': 'keyword',
    'method': 'keyword',
    'field': 'keyword',
    'static': 'keyword',
    'var': 'keyword',
    'int': 'keyword',
    'char': 'keyword',
    'boolean': 'keyword',
    'void': 'keyword',
    'true': 'keyword',
    'false': 'keyword',
    'null': 'keyword',
    'this': 'keyword',
    'if': 'keyword',
    'elif': 'keyword',
    'else': 'keyword',
    'while': 'keyword',
    'return': 'keyword',
    '{': 'symbol',
    '}': 'symbol',
    '(': 'symbol',
    ')': 'symbol',
    '[': 'symbol',
    ']': 'symbol',
    '.': 'symbol',
    ',': 'symbol',
    ';': 'symbol',
    '+': 'symbol',
    '-': 'symbol',
    '*': 'symbol',
    '/': 'symbol',
    '&': 'symbol',
    '|': 'symbol',
    '^': 'symbol',
    '<': 'symbol',
    '>': 'symbol',
    '=': 'symbol',
    '~': 'symbol',
    '%': 'symbol',
    '_GE_': 'symbol',
    '_LE_': 'symbol',
    '_NE_': 'symbol',
    '_DEC_': 'symbol',
    '_INC_': 'symbol',
    '_LSHIFT_': 'symbol',
    '_RSHIFT_': 'symbol',
    '_EQUAL_': 'symbol',
    '"': 'quote',
}
# Tokens that get padded with spaces so a plain whitespace split isolates them.
# Every placeholder from `long_operators` must appear here — otherwise e.g.
# "a==b" would become "a_EQUAL_b" and survive the split as one glued
# identifier (this was a real bug for '_EQUAL_', '_INC_', '_DEC_',
# '_LSHIFT_' and '_RSHIFT_').
space_tokens = [
    '(', ')', '[', ']', '{', '}', ';', '"', ',', '.', '~', '+', '-', '*', '/', '<', '>', '=',
    '_LE_', '_GE_', '_NE_', '_EQUAL_', '_INC_', '_DEC_', '_LSHIFT_', '_RSHIFT_',
]
# Two-character operators collapsed into single placeholder words BEFORE the
# single-character delimiters above are space-padded.  '<<', '>>', '++' and
# '--' were declared in `token_type` but previously had no mapping here, so
# they could never be produced.
long_operators = {
    '==': '_EQUAL_', '<=': '_LE_', '>=': '_GE_', '!=': '_NE_',
    '<<': '_LSHIFT_', '>>': '_RSHIFT_', '++': '_INC_', '--': '_DEC_',
}
class Tokenizer:
    """Accumulates Token objects produced from successive source lines."""

    def __init__(self):
        # All tokens collected across tokenize() calls so far.
        self.tokens = []

    def tokenize(self, line):
        """Tokenize *line*, append its tokens, and return the full list."""
        self.tokens.extend(self.line2tokens(line))
        return self.tokens

    def clear(self):
        """Forget every token collected so far."""
        self.tokens = []

    def line2tokens(self, line):
        """Convert one source line into a list of Token objects."""
        # Collapse multi-character operators into placeholder words so the
        # whitespace split below cannot tear them apart.
        for op, placeholder in long_operators.items():
            line = line.replace(op, placeholder)
        # Pad every delimiter with spaces so split() isolates it.
        for delim in space_tokens:
            line = line.replace(delim, ' ' + delim + ' ')
        result = []
        for word in line.strip().split():
            # Classify: known keyword/symbol/quote, else number, else name.
            if word in token_type:
                kind = token_type[word]
            elif word.isdigit():
                kind = 'integerConstant'
            else:
                kind = 'identifier'
            result.append(Token(kind, word))
        return result