def tokenize(self, string):
    keywords = set([
        'BA_',
        'BA_DEF_',
        'BA_DEF_DEF_',
        'BA_DEF_DEF_REL_',
        'BA_DEF_REL_',
        'BA_DEF_SGTYPE_',
        'BA_REL_',
        'BA_SGTYPE_',
        'BO_',
        'BO_TX_BU_',
        'BS_',
        'BU_',
        'BU_BO_REL_',
        'BU_EV_REL_',
        'BU_SG_REL_',
        'CAT_',
        'CAT_DEF_',
        'CM_',
        'ENVVAR_DATA_',
        'EV_',
        'EV_DATA_',
        'FILTER',
        'NS_',
        'NS_DESC_',
        'SG_',
        'SG_MUL_VAL_',
        'SGTYPE_',
        'SGTYPE_VAL_',
        'SIG_GROUP_',
        'SIG_TYPE_REF_',
        'SIG_VALTYPE_',
        'SIGTYPE_VALTYPE_',
        'VAL_',
        'VAL_TABLE_',
        'VERSION'
    ])

    names = {
        'LPAREN': '(',
        'RPAREN': ')',
        'LBRACE': '[',
        'RBRACE': ']',
        'COMMA': ',',
        'AT': '@',
        'SCOLON': ';',
        'COLON': ':',
        'PIPE': '|',
        'SIGN': '+/-'
    }

    token_specs = [
        ('SKIP', r'[ \r\n\t]+|//.*?\n'),
        ('NUMBER', r'-?\d+\.?\d*([eE][+-]?\d+)?'),
        ('WORD', r'[A-Za-z0-9_]+'),
        ('STRING', r'"(\\"|[^"])*?"'),
        ('LPAREN', r'\('),
        ('RPAREN', r'\)'),
        ('LBRACE', r'\['),
        ('RBRACE', r'\]'),
        ('COMMA', r','),
        ('PIPE', r'\|'),
        ('AT', r'@'),
        ('SIGN', r'[+-]'),
        ('SCOLON', r';'),
        ('COLON', r':'),
        ('MISMATCH', r'.')
    ]

    tokens, token_regex = tokenize_init(token_specs)

    for mo in re.finditer(token_regex, string, re.DOTALL):
        kind = mo.lastgroup

        if kind == 'SKIP':
            pass
        elif kind == 'STRING':
            value = mo.group(kind)[1:-1].replace('\\"', '"')
            tokens.append(Token(kind, value, mo.start()))
        elif kind != 'MISMATCH':
            value = mo.group(kind)

            if value in keywords:
                kind = value

            if kind in names:
                kind = names[kind]

            tokens.append(Token(kind, value, mo.start()))
        else:
            raise TokenizeError(string, mo.start())

    return tokens

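# A minimal sketch of how the token_specs/keyword-promotion pattern above is
# typically exercised on its own (assumptions: Token and tokenize_init come
# from the textparser package; the DBC fragment, spec subset and keyword set
# below are illustrative only, not the full grammar used by the method above).
import re

from textparser import Token, tokenize_init

specs = [('SKIP', r'[ \r\n\t]+'),
         ('NUMBER', r'-?\d+'),
         ('WORD', r'[A-Za-z0-9_]+'),
         ('COLON', r':'),
         ('MISMATCH', r'.')]
keywords = set(['BO_'])

tokens, token_regex = tokenize_init(specs)

for mo in re.finditer(token_regex, 'BO_ 500 IoDebug: 4 IoDbg'):
    kind = mo.lastgroup

    if kind == 'SKIP':
        continue

    value = mo.group(kind)

    # Promote keyword words (here only 'BO_') to their own token kind,
    # just as the tokenize() above does before mapping symbolic names.
    if value in keywords:
        kind = value

    tokens.append(Token(kind, value, mo.start()))

# tokens now starts with the '__SOF__' marker added by tokenize_init(),
# followed by BO_/NUMBER/WORD/COLON tokens for the fragment above.
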
def tokenize(items, add_eof_token=True):
    tokens = []

    for item in items:
        if len(item) == 2:
            token = Token(*item, offset=1)
        else:
            token = Token(*item)

        tokens.append(token)

    if add_eof_token:
        tokens.append(Token('__EOF__', None, -1))

    return tokens

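# Usage sketch for the helper above (the values are illustrative): two-element
# items get a dummy offset of 1, three-element items keep their own offset,
# and an '__EOF__' sentinel is appended unless disabled.
expected_tokens = tokenize([('WORD', 'foo'), ('NUMBER', '47', 4)])
# -> [Token(kind='WORD', value='foo', offset=1),
#     Token(kind='NUMBER', value='47', offset=4),
#     Token(kind='__EOF__', value=None, offset=-1)]
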
def tokenize(self, string):
    token_specs = [
        ('SKIP', r'\r+|\s*\|[^\n]*'),
        ('NL', r'\n'),
        ('KEYWORD', r'\[.+?\]'),
        ('WORD', r'[^ \n\t\r\f\v=]+'),
        ('WS', r'[ \t\r\f\v]+'),
        ('EQ', r'='),
        ('MISMATCH', r'.')
    ]

    tokens, token_regex = tokenize_init(token_specs)

    for mo in re.finditer(token_regex, string, re.DOTALL):
        kind = mo.lastgroup

        if kind == 'SKIP':
            pass
        elif kind != 'MISMATCH':
            value = mo.group(kind)

            if kind == 'KEYWORD':
                keyword = value.lower().replace('_', ' ')

                if keyword in KEYWORDS:
                    kind = keyword

            tokens.append(Token(kind, value, mo.start()))
        else:
            raise TokenizeError(string, mo.start())

    return tokens

def test_create_token_re(self):
    datas = [
        ([('A', r'a')], '(?P<A>a)'),
        ([('A', r'b'), ('C', r'd')], '(?P<A>b)|(?P<C>d)')
    ]

    for spec, expected_re_token in datas:
        tokens, re_token = tokenize_init(spec)
        self.assertEqual(
            tokens,
            [Token(kind='__SOF__', value='__SOF__', offset=0)])
        self.assertEqual(re_token, expected_re_token)

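# A sketch of an implementation that would satisfy the expectations in the
# test above (the real textparser.tokenize_init() may differ in details): it
# seeds the token list with a start-of-file marker and joins the specs into a
# single alternation of named groups.
from textparser import Token


def tokenize_init_sketch(spec):
    tokens = [Token('__SOF__', '__SOF__', 0)]
    re_token = '|'.join(
        '(?P<{}>{})'.format(name, regex) for name, regex in spec)

    return tokens, re_token
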
def tokenize(self, string):
    names = {
        'LPAREN': '(',
        'RPAREN': ')',
        'LBRACE': '[',
        'RBRACE': ']',
        'COMMA': ',',
        'ASSIGN': '=',
        'ENUMS': '{ENUMS}',
        'SIGNALS': '{SIGNALS}',
        'SEND': '{SEND}',
        'RECEIVE': '{RECEIVE}',
        'SENDRECEIVE': '{SENDRECEIVE}',
        'U': '/u:',
        'F': '/f:',
        'O': '/o:',
        'MIN': '/min:',
        'MAX': '/max:',
        'D': '/d:',
        'LN': '/ln:',
        'E': '/e:',
        'P': '/p:',
        'M': '-m',
        'H': '-h',
        'B': '-b',
        'S': '-s',
        'T': '-t',
        'V': '-v'
    }

    re_string = r'"(\\"|[^"])*?"'

    token_specs = [
        ('SKIP', r'[ \r\n\t]+'),
        ('COMMENT', r'//.*?\n'),
        ('NUMBER', r'-?\d+\.?[0-9A-F]*([eE][+-]?\d+)?'),
        ('STRING', re_string),
        ('U', r'/u:({}|\S+)'.format(re_string)),
        ('F', r'/f:'),
        ('O', r'/o:'),
        ('MIN', r'/min:'),
        ('MAX', r'/max:'),
        ('D', r'/d:'),
        ('LN', r'/ln:'),
        ('E', r'/e:'),
        ('P', r'/p:'),
        ('M', r'\-m'),
        ('H', r'\-h'),
        ('B', r'\-b'),
        ('S', r'\-s'),
        ('T', r'\-t'),
        ('V', r'\-v'),
        ('LPAREN', r'\('),
        ('RPAREN', r'\)'),
        ('LBRACE', r'\['),
        ('RBRACE', r'\]'),
        ('COMMA', r','),
        ('ASSIGN', r'='),
        ('ENUMS', r'\{ENUMS\}'),
        ('SIGNALS', r'\{SIGNALS\}'),
        ('SEND', r'\{SEND\}'),
        ('RECEIVE', r'\{RECEIVE\}'),
        ('SENDRECEIVE', r'\{SENDRECEIVE\}'),
        ('WORD', r'[^\s=\(\]\-]+'),
        ('MISMATCH', r'.')
    ]

    tokens, token_regex = tokenize_init(token_specs)

    for mo in re.finditer(token_regex, string, re.DOTALL):
        kind = mo.lastgroup

        if kind == 'SKIP':
            pass
        elif kind == 'STRING':
            value = mo.group(kind)[1:-1].replace('\\"', '"')
            tokens.append(Token(kind, value, mo.start()))
        elif kind != 'MISMATCH':
            value = mo.group(kind)

            if value in self.KEYWORDS:
                kind = value

            if kind in names:
                kind = names[kind]

            tokens.append(Token(kind, value, mo.start()))
        else:
            raise TokenizeError(string, mo.start())

    return tokens

def test_parser_default_keywords(self):
    class Parser(textparser.Parser):

        def token_specs(self):
            return [
                ('SKIP', r'[ \r\n\t]+'),
                ('NUMBER', r'-?\d+(\.\d+)?([eE][+-]?\d+)?'),
                ('DOT', '.', r'\.'),
                ('WORD', r'[A-Za-z0-9_]+'),
                ('ESCAPED_STRING', r'"(\\"|[^"])*?"'),
                ('MISMATCH', r'.')
            ]

        def grammar(self):
            return Sequence(
                'WORD',
                Optional('WORD'),
                'ESCAPED_STRING',
                'WORD',
                Optional(choice(DelimitedList('ESCAPED_STRING'),
                                ZeroOrMore('NUMBER'))),
                '.')

    datas = [
        (
            'IF "foo" bar .',
            ['IF', [], '"foo"', 'bar', [[]], '.'],
            [
                Token(kind='WORD', value='IF', offset=0),
                [],
                Token(kind='ESCAPED_STRING', value='"foo"', offset=3),
                Token(kind='WORD', value='bar', offset=9),
                [[]],
                Token(kind='.', value='.', offset=13)
            ]
        ),
        (
            'IF B "" b 1 2 .',
            ['IF', ['B'], '""', 'b', [['1', '2']], '.'],
            [
                Token(kind='WORD', value='IF', offset=0),
                [Token(kind='WORD', value='B', offset=3)],
                Token(kind='ESCAPED_STRING', value='""', offset=5),
                Token(kind='WORD', value='b', offset=8),
                [[
                    Token(kind='NUMBER', value='1', offset=10),
                    Token(kind='NUMBER', value='2', offset=12)
                ]],
                Token(kind='.', value='.', offset=14)
            ]
        )
    ]

    for text, expected_tree, expected_token_tree in datas:
        tree = Parser().parse(text)
        self.assertEqual(tree, expected_tree)

        tree = Parser().parse(text, token_tree=True)
        self.assertEqual(tree, expected_token_tree)

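# The three-element spec ('DOT', '.', r'\.') above relies on the default
# textparser.Parser.tokenize() letting the middle element rename the token
# kind, which is why the expected tokens carry kind='.'. A sketch of that
# renaming step consistent with the test above (not the library's actual
# code):
def split_token_specs(token_specs):
    names = {}
    specs = []

    for spec in token_specs:
        if len(spec) == 2:
            specs.append(spec)
        else:
            specs.append((spec[0], spec[2]))
            names[spec[0]] = spec[1]

    return names, specs
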
def tokenize(self, string):
    keywords = set([
        'FormatVersion',
        'Title',
        'Enum',
        'Sig',
        'ID',
        'Len',
        'Mux',
        'CycleTime',
        'Timeout',
        'MinInterval',
    ])

    names = {
        'LPAREN': '(',
        'RPAREN': ')',
        'LBRACE': '[',
        'RBRACE': ']',
        'COMMA': ',',
        'ASSIGN': '=',
        'ENUMS': '{ENUMS}',
        'SIGNALS': '{SIGNALS}',
        'SEND': '{SEND}',
        'RECEIVE': '{RECEIVE}',
        'SENDRECEIVE': '{SENDRECEIVE}',
        'U': '/u:',
        'F': '/f:',
        'O': '/o:',
        'MIN': '/min:',
        'MAX': '/max:',
        'D': '/d:',
        'LN': '/ln:',
        'E': '/e:',
        'M': '-m'
    }

    token_specs = [
        ('SKIP', r'[ \r\n\t]+|//.*?\n'),
        ('NUMBER', r'-?\d+\.?\d*([eE][+-]?\d+)?'),
        ('WORD', r'[A-Za-z0-9_\*]+'),
        ('STRING', r'"(\\"|[^"])*?"'),
        ('LPAREN', r'\('),
        ('RPAREN', r'\)'),
        ('LBRACE', r'\['),
        ('RBRACE', r'\]'),
        ('COMMA', r','),
        ('ASSIGN', r'='),
        ('ENUMS', r'\{ENUMS\}'),
        ('SIGNALS', r'\{SIGNALS\}'),
        ('SEND', r'\{SEND\}'),
        ('RECEIVE', r'\{RECEIVE\}'),
        ('SENDRECEIVE', r'\{SENDRECEIVE\}'),
        ('U', r'/u:'),
        ('F', r'/f:'),
        ('O', r'/o:'),
        ('MIN', r'/min:'),
        ('MAX', r'/max:'),
        ('D', r'/d:'),
        ('LN', r'/ln:'),
        ('E', r'/e:'),
        ('M', r'\-m'),
        ('MISMATCH', r'.')
    ]

    tokens, token_regex = tokenize_init(token_specs)

    for mo in re.finditer(token_regex, string, re.DOTALL):
        kind = mo.lastgroup

        if kind == 'SKIP':
            pass
        elif kind == 'STRING':
            value = mo.group(kind)[1:-1].replace('\\"', '"')
            tokens.append(Token(kind, value, mo.start()))
        elif kind != 'MISMATCH':
            value = mo.group(kind)

            if value in keywords:
                kind = value

            if kind in names:
                kind = names[kind]

            tokens.append(Token(kind, value, mo.start()))
        else:
            raise TokenizeError(string, mo.start())

    return tokens

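# With keyword and symbol promotion in place, a grammar can reference the
# promoted kinds directly ('FormatVersion' and 'Title' come from the keyword
# set, '=' from the ASSIGN mapping). A hypothetical fragment in textparser's
# grammar style, illustrative only and not the actual grammar paired with the
# tokenizer above:
from textparser import Optional, Sequence


def sym_header_grammar_sketch():
    format_version = Sequence('FormatVersion', '=', 'NUMBER')
    title = Sequence('Title', '=', 'STRING')

    return Sequence(format_version, Optional(title))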