-
Notifications
You must be signed in to change notification settings - Fork 0
/
lexer.py
77 lines (63 loc) · 2.15 KB
/
lexer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/python
# -*- coding: utf-8 -*-
from plex import *
import patterns
from token import Token
import string
# --- Plex pattern definitions for the XML/HTML lexicon ---

# A letter: any ASCII letter plus Polish diacritics (both cases).
letter = Range("AZaz") | Str("ą", "ć", "ę", "ł", "ń", "ó",
                             "ś", "ż", "ź", "Ą", "Ć", "Ę", "Ł", "Ń", "Ó", "Ś", "Ż", "Ź")
# Extra characters permitted inside an identifier after the leading letter.
identCharacter = Str("_", "-", ".")
digit = Range("09")
# Tokens:
# Identifier: a letter followed by any run of letters, digits, or _ - .
identifier = letter + Rep(letter | digit | identCharacter)
# Number: one or more decimal digits.
number = Rep1(digit)
# XML comment.  NOTE(review): Rep(AnyChar) is unbounded; under Plex's
# longest-match rule this may consume input up to the LAST "-->" rather
# than the nearest one — confirm this is the intended behaviour.
comment = Str("<!--") + Rep(AnyChar) + Str("-->")
# Multi-character XML/HTML punctuation.
symbol = Str("</", "/>", "<?", "?>", "<![", "]]>", '="')
# One or more hexadecimal digits (0-9, A-F, a-f).
hexadecimal = Rep1(digit | Range('AFaf'))
keyword = Str("CDATA")
# Character reference or named entity, i.e. &#194;Â, &#160;, &gt;
# Possible formats:
# &#nnnn, &#xhhhh, &name
# where: n - code point in decimal form
# h - code point in hexadecimal form
# name - name of entity
# NOTE(review): each `hexadecimal` is itself Rep1, so the &#x form below
# requires at least FOUR hex digits and accepts more — verify that short
# references like &#xA; are meant to be excluded.
numericCharRef = ((Str('&#') + Rep1(digit)) + Str(';') |
                  (Str('&#x') + Seq(hexadecimal, hexadecimal, hexadecimal, hexadecimal)) + Str(';') |
                  (Str('&') + identifier + Str(';')))
# otherSymbol is any char, which is not an identifier, a keyword,
# a symbol, a number, a whitespace or a comment
otherSymbol = AnyChar
class Lexer(Scanner):
    """Plex scanner that tokenizes an XML/HTML-like character stream.

    Each lexicon entry pairs a pattern (defined at module level or in the
    ``patterns`` helper module) with an action: TEXT emits the matched
    text itself as the token value, IGNORE discards the match, and a
    string names the token type.
    """
    # NOTE(review): Plex picks the longest match and, presumably, breaks
    # length ties by entry order below (e.g. `keyword` before
    # `identifier`) — confirm against the Plex Lexicon documentation
    # before reordering anything here.
    lexicon = Lexicon([
        (keyword, TEXT),
        (patterns.close_punctuation, TEXT),
        (patterns.open_punctuation, TEXT),
        (patterns.connector_punctuation, TEXT),
        (patterns.dash_punctuation, TEXT),
        (patterns.currency_symbol, TEXT),
        (patterns.initial_punctuation, TEXT),
        (patterns.final_punctuation, TEXT),
        (patterns.other_punctuation_1, TEXT),
        (patterns.other_punctuation_2, TEXT),
        (numericCharRef, 'xmlHtmlEntity'),
        (identifier, 'ident'),
        (number, 'number'),
        (patterns.whitespaces, IGNORE),  # whitespace is dropped, not tokenized
        (comment, IGNORE),               # comments are dropped as well
        (symbol, TEXT),
        (otherSymbol, TEXT)              # catch-all: any single leftover char
    ])

    def __init__(self, stream):
        """Create a scanner reading characters from *stream* (a file-like object)."""
        Scanner.__init__(self, self.lexicon, stream)
def initialize(stream):
    """Bind the module-level ``lexer`` to a new ``Lexer`` over *stream*.

    Must be called before ``read``, ``read2`` or ``position`` — they all
    operate on this shared module-global scanner.
    """
    global lexer
    lexer = Lexer(stream)
def read():
    """Fetch the next token from the global lexer, wrapped in a ``Token``.

    The scanner yields a (value, text) pair; the value becomes the
    Token's type and the matched text is stored on ``content``.
    """
    kind, text = lexer.read()
    result = Token(kind)
    result.content = text
    return result
def read2():
    """Return the next raw token pair from the global lexer, unwrapped."""
    return lexer.read()
def position():
    """Return the global lexer's current position (as reported by Plex)."""
    return lexer.position()