def test_error_info():
    tokenize = make_tokenizer([
        Spec('keyword', r'(is|end)'),
        Spec('id', r'[a-z]+'),
        Spec('space', r'[ \t]+'),
        Spec('nl', r'[\n\r]+'),
    ])
    try:
        list(tokenize(u'f is ф'))
    except LexerError as e:
        eq_(unicode(e), u'1,6-1,6: cannot tokenize data: "f is \u0444"')
def tokenize(str):
    'str -> Sequence(Token)'
    specs = [
        make_multiline_comment(r'/\*', r'\*/'),
        make_comment(r'//'),
        newline,
        space,
        Spec('name', r'[A-Za-z\200-\377_][A-Za-z\200-\377_0-9]*'),
        Spec('op', r'[{};,=\[\]]|(->)|(--)'),
        Spec('number', r'-?(\.[0-9]+)|([0-9]+(\.[0-9]*)?)'),
        Spec('string', r'"[^"]*"'),  # '\"' escapes are ignored
    ]
    useless = ['comment', 'newline', 'space']
    t = make_tokenizer(specs)
    return [x for x in t(str) if x.type not in useless]
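A minimal usage sketch for the tokenizer above. The input graph and the printed fields are illustrative; it assumes `Token` objects expose `type` and `value`, as the filtering code above already relies on `x.type`.

# Illustrative only: feed a tiny DOT fragment through tokenize() defined above.
# Comments, newlines and spaces are already filtered out by the function itself.
for t in tokenize(u'digraph g { a -> b; /* edge */ }'):
    print t.type, t.value  # e.g. name digraph, name g, op {, name a, op ->, ...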
def tokenize(str):
    'str -> Sequence(Token)'
    specs = [
        Spec('space', r'[ \t\r\n]+'),
        Spec('string', ur'"(%(unescaped)s | %(escaped)s)*"' % regexps, VERBOSE),
        Spec('number', r'''
            -?                  # Minus
            (0|([1-9][0-9]*))   # Int
            (\.[0-9]+)?         # Frac
            ([Ee][+-][0-9]+)?   # Exp
            ''', VERBOSE),
        Spec('op', r'[{}\[\]\-,:]'),
        Spec('name', r'[A-Za-z_][A-Za-z_0-9]*'),
    ]
    useless = ['space']
    t = make_tokenizer(specs)
    return [x for x in t(str) if x.type not in useless]
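A usage sketch for the JSON tokenizer above. The sample document and the expected token list are illustrative, assuming the `regexps` dict and the `re.VERBOSE` flag used elsewhere in these snippets are in scope.

# Illustrative only: token types come straight from the Spec names above.
toks = tokenize(u'{"answer": 42}')
print [(t.type, t.value) for t in toks]
# Roughly: [('op', u'{'), ('string', u'"answer"'), ('op', u':'),
#           ('number', u'42'), ('op', u'}')]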
def tokenize(str):
    'str -> Sequence(Token)'
    specs = [
        Spec('space', r'[ \t\r\n]+'),
        Spec('string', ur'"(%(unescaped)s | %(escaped)s)*"' % regexps, VERBOSE),
        # NOTE: sometimes a number ends up where a name belongs, so we
        # shouldn't use this spec
        # TODO: consider removing or updating it
        # Spec('number', r'''
        #     -?                  # Minus
        #     (0|([1-9][0-9]*))   # Int
        #     (\.[0-9]+)?         # Frac
        #     ([Ee][+-][0-9]+)?   # Exp
        #     \b''', VERBOSE),
        Spec('op', r'[{}\(\),;=]'),
        Spec('comment', r'/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/'),
        Spec('name', r'[/.A-Za-z_0-9]+'),
    ]
    useless = ['space']
    t = make_tokenizer(specs)
    return [x for x in t(str) if x.type not in useless]
def test_error_info():
    tokenize = make_tokenizer([
        Spec('keyword', r'(is|end)'),
        Spec('id', r'[a-z]+'),
        Spec('space', r'[ \t]+'),
        Spec('nl', r'[\n\r]+'),
    ])
    try:
        list(tokenize('f is ф'))
    except LexerError as e:
        pass
    else:
        ok_(False, 'must raise LexerError')

    keyword = lambda s: tok('keyword', s)
    id = tok('id')
    is_ = keyword('is')
    end = keyword('end')
    nl = tok('nl')

    equality = id + skip(is_) + id >> tuple
    expr = equality + skip(nl)
    file = many(expr) + end

    msg = """\
rake is eggs
eggs isnt spam
end"""
    toks = [x for x in tokenize(msg) if x.type != 'space']
    try:
        file.parse(toks)
    except ParserError as e:
        msg, pos, i = e.args
        eq_(msg, "got unexpected token: id 'spam'")
        eq_(pos, ((2, 11), (2, 14)))
        # May raise KeyError
        t = toks[i]
        eq_(t, Token('id', 'spam'))
    else:
        ok_(False, 'must raise ParserError')
def make_multiline_comment(open, close):
    return Spec('comment', r'%s(.|[\r\n])*?%s' % (open, close), MULTILINE)
def make_comment(start):
    return Spec('comment', r'%s.*' % start)
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

from re import MULTILINE
from funcparserlib.lexer import Spec

__all__ = ['make_comment', 'make_multiline_comment', 'newline', 'space']


# Comments

def make_comment(start):
    return Spec('comment', r'%s.*' % start)


def make_multiline_comment(open, close):
    return Spec('comment', r'%s(.|[\r\n])*?%s' % (open, close), MULTILINE)


# Common tokens

newline = Spec('newline', r'[\r\n]+')
space = Spec('space', r'[ \t\r\n]+')
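A brief sketch of combining these shared specs with a tokenizer. The `word` spec and the sample input are illustrative; passing `Spec` objects to `make_tokenizer` follows the usage in the surrounding snippets.

# Illustrative only: build a throwaway tokenizer from the shared specs above.
from funcparserlib.lexer import make_tokenizer

specs = [make_comment(r'#'), newline, space, Spec('word', r'\w+')]
t = make_tokenizer(specs)
words = [x for x in t('# config\nfoo bar') if x.type == 'word']  # foo, bar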
ENCODING = 'utf-8'

regexps = {
    'escaped': ur'''
        \\                                  # Escape
        ((?P<standard>["\\/bfnrt])          # Standard escapes
        | (u(?P<unicode>[0-9A-Fa-f]{4})))   # uXXXX
        ''',
    'unescaped': ur'''
        [\x20-\x21\x23-\x5b\x5d-\uffff]     # Unescaped: avoid ["\\]
        ''',
}
specs = [
    Spec('eol', r'[\r\n]+'),
    Spec('space', r'\s+'),
    Spec('string', ur'"(%(unescaped)s | %(escaped)s)*"' % regexps, VERBOSE),
    Spec('name', r'[A-Za-z_][A-Za-z_0-9]*'),
    Spec('class', r'\.[A-Za-z_][A-Za-z_0-9]*'),
    Spec('id', r'#[A-Za-z_][A-Za-z_0-9]*'),
    Spec('eq', r'='),
    Spec('>', '>'),
    Spec('<', '<'),
]
tokenizer = make_tokenizer(specs)


class Eol(object):
    def __init__(self, data):
        pass