class Strip(TokenizeContext):
    # Dotted item path: optional leading dot, one item, then any number of
    # ".item" repetitions — e.g. `a.b.c` or `.a.b`.
    class Dot(Token):
        pattern = "."

    class Item(TokenizeContext):
        # No pattern declared here; presumably supplied by the framework or
        # assigned elsewhere — TODO confirm.
        pass

    pattern = And(Option(Dot), Item, Option(Many(And(Dot, Item))))
class Indent(TokenizeContext):
    # Leading indentation: any (possibly empty) mix of tabs and spaces.
    class Tab(Token):
        pattern = "\t"

    class Space2(Token):
        # NOTE(review): named "Space2" but the pattern as seen is a single
        # space — confirm whether a two-space pattern was intended.
        pattern = " "

    pattern = Option(Many(Or(Tab, Space2)))
class Number(TokenizeContext):
    # Numeric literal: a whole part followed by an optional decimal part.
    class Spacing(Token):
        # Digit-group separator, e.g. `52_300`.
        pattern = "_"

    class LeadingZeros(TokenizeContext):
        # Empty here; presumably matched by framework rules defined
        # elsewhere — TODO confirm.
        pass

    class Whole(TokenizeContext):
        pass

    class Decimal(TokenizeContext):
        class Dot(Token):
            pattern = "."
        # NOTE(review): no pattern visible for Decimal itself — it may be
        # assigned elsewhere; confirm.

    # 000 52_300 . 322_000
    pattern = And(Whole, Option(Decimal))
class Block(TokenizeContext):
    # Block construct opened by a single quote, optionally followed by either
    # whitespace plus text (`Str`) or a `Line`. `Space`, `Str` and `Line` are
    # defined elsewhere in this file — TODO confirm their semantics.
    class Start(Token):
        pattern = "'"

    pattern = And(Start, Option(Or(And(Space, Option(Str)), Line)))
class Id(TokenizeContext):
    """Identifier context: a start character followed by an optional tail.

    An identifier character is anything not explicitly disallowed (see
    ``Base``); ``Special`` additionally recognizes runs of "special symbol"
    characters from selected Unicode symbol blocks.
    """

    class Strip(TokenizeContext):
        # Dotted item path, same shape as the module-level Strip:
        # optional leading dot, item, then repeated ".item".
        class Dot(Token):
            pattern = "."

        class Item(TokenizeContext):
            pass

        pattern = And(Option(Dot), Item, Option(Many(And(Dot, Item))))

    class Special(TokenizeContext):
        # A run of one or more "special symbol" characters.
        class Char(Token):
            @classmethod
            def is_special_symbol_char(cls, l):
                """
                TODO: might not be precise, not so black n white
                goal: a special symbol may act as a word separator, thus,
                should not be a "letter", though seemed easier to define it as
                all unicode chars in "unicode symbol blocks" except eg. emojis
                and some letters that happend to be there?
                """
                c = ord(l)
                if c < 0xC0:
                    # Latin-1 range: exclude structural chars, digits and
                    # ASCII letters; everything else counts as special.
                    for r in ('_$ "(){}[]', 'ŠŒŽšœžŸ'):
                        if l in r:
                            return False
                    for r in (range(0x30, 0x39 + 1), range(0x41, 0x5A + 1),
                              range(0x61, 0x7A + 1)):
                        if c in r:
                            return False
                    return True
                # symbols
                if c in range(0x2000, 0x2800):
                    # General punctuation through dingbats: special unless it
                    # is one of these emoji-like exceptions.
                    for r in (
                            "⌚⌛",
                            "⏩⏪⏫⏬⏭⏮⏯⏰⏱⏲⏳⏴⏵⏶⏷⏸⏹⏺",
                            "☄☔☕☘☝☠☢☣☦☪☮☯☸♈♉♊♋♌♍♎♏♐♑♒♓♿⚒⚓⚔⚕⚖⚗⚙⚛⚜⚡⚪⚫⚰⚱⚽⚾⛄⛅⛈⛎⛏⛑⛓⛔⛩⛪⛰⛱⛲⛳⛴⛵⛷⛸⛹⛺⛽✅✊✋✌✍✨❌❎❓❔❕❗➕➖➗➰➿"
                    ):
                        if l in r:
                            return False
                    return True
                if c in range(0x2900, 0x2c00):
                    # NOTE(review): `("⬛⭐⭕")` is a parenthesized string, not
                    # a tuple — iteration yields single chars, which happens
                    # to behave the same as the intended tuple here.
                    for r in ("⬛⭐⭕"):
                        if l in r:
                            return False
                    return True
                if c in range(0x3000, 0x3040):
                    # NOTE(review): bare `True` is a no-op — likely a lost
                    # `return True` (CJK symbols block); as written this
                    # branch falls through to `return False`. Confirm intent.
                    True
                return False

            @classmethod
            def match(cls, linestr, start=0):
                # Single-char match: None at end of line, at a `//` comment
                # opener, or for non-special characters.
                if start >= len(linestr):
                    return None
                if linestr.startswith('//', start):
                    return None
                l = linestr[start]
                return cls.with_linestr(
                    start, start + 1,
                    linestr) if cls.is_special_symbol_char(l) else None

        pattern = Many(Char)

    class Base(Token):
        # Single identifier character. `_` and `$` are always allowed;
        # structural/whitespace/comment chars are always rejected; anything
        # else is rejected only if it starts one of `disallowedTokens`.
        allowed_chars = '_$'
        disallowed_chars = '(){}[]" \t/'
        disallowedTokens = []  # see below

        @classmethod
        def match(cls, linestr, start=0):
            if start >= len(linestr):
                return None
            l = linestr[start]
            if l in cls.allowed_chars:
                pass
            elif l in cls.disallowed_chars:
                return None
            else:
                v, r, ok = match(Or(*cls.disallowedTokens), (linestr, start))
                if ok:
                    return None
            return cls.with_linestr(start, start + 1, linestr)

    class Start(Base):
        pass

    class Middle(Base):
        pass

    class Tail(TokenizeContext):
        # NOTE(review): no pattern visible for Tail in this chunk —
        # presumably assigned elsewhere (it nests recursively, per the
        # `type(rest[0]) is cls` check below); confirm.
        @classmethod
        def process_patternMatch(cls, v):
            # Flatten a nested Tail chain into a single flat list of parts.
            if type(v) is list:
                middle, *rest = v
                vs = rest[0].patternMatch if len(rest) == 1 and type(
                    rest[0]) is cls else rest
                return [middle, *vs] if type(vs) is list else [middle, vs]
            return v

    pattern = And(Start, Option(Tail))

    @classmethod
    def process_patternMatch(cls, v):
        # [start] when there is no tail, else [start, flattened-tail].
        return [v[0]] if not v[1] else [v[0], v[1].patternMatch]
def setPattern(cls):
    # Installs a delimited-expression pattern on `cls`: its Start token,
    # optional whitespace, an optional Expression or dotted Id.Strip, more
    # optional whitespace, then its optional End token. Presumably applied to
    # bracket-like contexts defined elsewhere — TODO confirm callers.
    cls.pattern = And(cls.Start, Option(Space.White),
                      Option(Or(Expression, Id.Strip)),
                      Option(Space.White), Option(cls.End))
# NOTE(review): the Line/Block pair below duplicates an earlier Block and
# mirrors the shape of comment rules (Start "//" / "'"), and the later
# statement references Comment.Top/Comment.Block/Comment.Line — these two
# classes may originally have been nested inside a `class Comment` whose
# header was lost in formatting. Confirm against the original layout.
class Line(TokenizeContext):
    # Line comment: `//` then optionally whitespace + text, or a nested Line.
    class Start(Token):
        pattern = "//"

    pattern = And(Start, Option(Or(And(Space, Option(Str)), Line)))


class Block(TokenizeContext):
    # Block comment opened by a single quote.
    class Start(Token):
        pattern = "'"

    pattern = And(Start, Option(Or(And(Space, Option(Str)), Line)))


# file/line
Line.pattern = And(Indent, Option(Space.White),
                   Or(Comment.Top, Comment.Block, Expression),
                   Option(Space.White), Option(Comment.Line))

# Logic for tokenizing whole file elsewhere
# class File(TokenizeContext):
#     pattern = Option(Many(Line))

# primitives
class String(TokenizeContext):
    # String literal delimited by double quotes.
    class Start(Token):
        pattern = '"'

    # Definition continues beyond this chunk.
    class End(Token):