def get_tokens(text, encoding=None):
    """Lazily split ``text`` into ``(tokentype, value)`` pairs.

    ``text`` may be a string, a byte string, or any object exposing a
    ``read()`` method (whose full content is read first). Content that
    is not already ``text_type`` is decoded with ``encoding``; if that
    raises ``UnicodeDecodeError`` it is decoded with ``unicode-escape``
    instead.

    At each position the matchers in ``SQL_REGEX`` are tried in order
    and the first one that matches wins:

    * if its action is a token type, ``(action, matched_text)`` is
      yielded;
    * if its action is callable, the result of ``action(matched_text)``
      is yielded as-is (presumably itself a ``(tokentype, value)``
      pair -- confirm against the ``SQL_REGEX`` definitions).

    A character at which no matcher succeeds is yielded as
    ``(tokens.Error, char)``.

    :param text: SQL source as a string, byte string or readable object.
    :param encoding: Codec used to decode byte input; a falsy value
        means ``'utf-8'``.
    :returns: A generator of token pairs.
    """
    encoding = encoding or 'utf-8'

    # Only stream-like objects need reading. Strings (and byte strings,
    # which have no ``read``) are used directly instead of being wrapped
    # in a StringIO just to be read back, which also failed for bytes.
    if not isinstance(text, string_types) and hasattr(text, 'read'):
        text = text.read()

    if not isinstance(text, text_type):
        try:
            text = text.decode(encoding)
        except UnicodeDecodeError:
            # Fallback for input that is not valid in ``encoding``.
            text = text.decode('unicode-escape')

    iterable = enumerate(text)
    for pos, char in iterable:
        for rexmatch, action in SQL_REGEX:
            m = rexmatch(text, pos)

            if not m:
                continue
            elif isinstance(action, tokens._TokenType):
                yield action, m.group()
            elif callable(action):
                yield action(m.group())

            # Skip the remaining characters of the match so the outer
            # loop resumes at ``m.end()`` (the current character has
            # already been taken from the iterator, hence the ``- 1``).
            consume(iterable, m.end() - pos - 1)
            break
        else:
            # No matcher accepted this position: emit the single
            # character as an error token and move on.
            yield tokens.Error, char