-
Notifications
You must be signed in to change notification settings - Fork 0
/
tokenizer.py
111 lines (96 loc) · 3.82 KB
/
tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from token import Token
class StringTokenizer:
    """Turn a source string into a stream of ``Token`` objects.

    The tokenizer keeps a cursor (``pos``) into ``text`` and exposes the
    character under the cursor as ``current_char``. ``current_char`` is
    ``None`` once the cursor has moved past the last character.

    ``tokentype`` is looked up with the keys ``'INT'``, ``'FLOAT'``,
    ``'ID'``, ``'CHAR'``, ``'STRING'`` and ``'EOF'``, and with every
    single-character symbol that should be recognised as a token of its
    own. ``keyword`` is stored on the instance but is not used by any
    method of this class.
    """

    def __init__(self, text='', tokentype=None, keyword=None):
        self.text = text
        self.tokentype = tokentype
        self.keyword = keyword
        self.pos = 0  # acts as a cursor within self.text
        # Empty text (the default) has no first character to point at, so
        # start directly in the "end of input" state instead of indexing.
        self.current_char = self.text[0] if self.text else None

    def advance(self):
        """Move the cursor one character forward.

        If the new position is still inside ``self.text``,
        ``self.current_char`` becomes the character at that position;
        otherwise it becomes ``None`` to signal the end of the input.
        """
        self.pos += 1
        if self.pos < len(self.text):
            self.current_char = self.text[self.pos]
        else:
            self.current_char = None

    def skip_whitespace(self):
        """Advance the cursor past a run of whitespace characters."""
        while self.current_char is not None and self.current_char.isspace():
            self.advance()

    def _read_digits(self):
        """Consume a run of digits and return it as a string ('' if none)."""
        start = self.pos
        while self.current_char is not None and self.current_char.isdigit():
            self.advance()
        return self.text[start:self.pos]

    def get_integer(self):
        """Consume a run of digits at the cursor and return it as an int.

        Raises:
            ValueError: if the character under the cursor is not a digit.
        """
        digits = self._read_digits()
        if not digits:
            raise ValueError(
                'expected a digit at position {}'.format(self.pos))
        return int(digits)

    def _scan_number(self):
        """Consume a numeric literal and return ``(kind, value)``.

        ``kind`` is ``'INT'`` with an int value, or ``'FLOAT'`` with a
        float value when the digits are followed by a ``.``.
        """
        whole = self._read_digits()
        if not whole:
            raise ValueError(
                'expected a digit at position {}'.format(self.pos))
        if self.current_char != '.':
            return 'INT', int(whole)
        self.advance()  # skip the decimal point
        # Keep the fraction as text: going through int() would drop its
        # leading zeros, and the '.' must be put back before conversion.
        # An empty fraction ("3.") is accepted and yields 3.0.
        fraction = self._read_digits()
        return 'FLOAT', float(whole + '.' + fraction)

    def number(self):
        """Recognize and return an integer or float token."""
        kind, value = self._scan_number()
        return Token(self.tokentype[kind], value)

    def identifier(self):
        """Recognize and return an identifier token.

        An identifier starts with an alphabetic character and continues
        with alphanumeric characters. If the cursor is not on an
        alphabetic character, nothing is consumed and the token value is
        the empty string.
        """
        start = self.pos
        if self.current_char is not None and self.current_char.isalpha():
            while self.current_char is not None and \
                    self.current_char.isalnum():
                self.advance()
        return Token(self.tokentype['ID'], self.text[start:self.pos])

    def string(self):
        """Recognize and return a string (or single-character) token.

        Must be called with the cursor just after the opening double
        quote. On return the cursor is on the closing quote, which the
        caller is expected to skip.

        Raises:
            ValueError: if the input ends before a closing quote is found.
        """
        start = self.pos
        while self.current_char != '"':
            if self.current_char is None:
                raise ValueError(
                    'unterminated string literal (content starts at '
                    'position {})'.format(start))
            self.advance()
        value = self.text[start:self.pos]
        # A literal of exactly one character is a CHAR token; anything
        # else, including the empty literal, is a STRING token.
        if len(value) == 1:
            return Token(self.tokentype['CHAR'], value)
        return Token(self.tokentype['STRING'], value)

    def generic_token(self, character):
        """Return the token for a single-character symbol."""
        return Token(self.tokentype[character], character)

    def create_token_generator(self):
        """Walk through the text and yield its tokens one at a time.

        This is a generator: tokens are produced lazily, and the last
        token yielded is always the ``'EOF'`` token.

        Raises:
            ValueError: if a character matches no tokenizing rule, or a
                string literal is not terminated.
        """
        while self.current_char is not None:
            char = self.current_char

            # handle whitespaces
            if char.isspace():
                self.skip_whitespace()
                continue

            # handle integer and float numbers
            if char.isdigit():
                yield self.number()
                continue

            # handle identifiers e.g variable names
            if char.isalpha():
                yield self.identifier()
                continue

            # handle strings e.g "Hello, World"
            if char == '"':
                self.advance()  # skip opening quote
                yield self.string()
                self.advance()  # skip closing quote
                continue

            # handle single characters e.g symbols
            if char in self.tokentype:
                self.advance()
                yield self.generic_token(char)
                continue

            # No rule consumed the character. Failing loudly here is what
            # keeps the loop from spinning forever on the same position.
            raise ValueError(
                'unexpected character {!r} at position {}'.format(
                    char, self.pos))

        # add token to indicate end of file (EOF)
        yield Token(self.tokentype['EOF'], None)