def scan_field_id(token, state, stream, lexdata):
    # When a candidate is found, do as follows:
    #   - save the candidate
    #   - eat any whitespace
    #   - if next is colon, candidate is an identifier, emit both
    #     identifier and colon
    candidate = token
    token = stream.peek()
    if token.type == "WS":
        token = stream.peek()
    if token.type == "COLON":
        # We do have an identifier, so replace the WORD token by the
        # right keyword token
        candidate = _new_token(META_FIELDS_ID[candidate.value], candidate)
    try:
        field_type = FIELD_TYPE[candidate.type]
    except KeyError:
        data = lexdata.splitlines()
        msg = ["Error while tokenizing %r (missing colon ?)" % candidate.value]
        msg += ["    Line %d -> %r" % (candidate.lineno, data[candidate.lineno-1])]
        raise SyntaxError("\n".join(msg))
    try:
        state = _FIELD_TYPE_TO_STATE[field_type]
    except KeyError:
        raise ValueError("Unknown state transition for type %s" % field_type)
    queue = [candidate]
    queue.append(gen_next(stream))
    nxt = gen_next(stream)
    return queue, nxt, state

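# Rough illustration of the scan above, on a hypothetical metadata line
# (the token names are the ones used in this module):
#
#   Name: foo
#
# "Name" arrives as a WORD token and is saved as the candidate; the COLON
# behind it confirms the candidate is a field identifier, so the WORD is
# replaced by the matching keyword token from META_FIELDS_ID, and FIELD_TYPE
# then selects the scanning state for the field body. Without the colon,
# FIELD_TYPE["WORD"] fails and the "missing colon ?" SyntaxError is raised.
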
def _skip_ws(tok, stream, state, internal):
    while tok.type in ["NEWLINE", "WS"]:
        if tok.type == "NEWLINE" and len(internal.words_stack) == 0:
            nxt = stream.peek()
            if nxt.type != "INDENT":
                state = "SCANNING_FIELD_ID"
            else:
                tok = gen_next(stream)
            return tok, state
        tok = gen_next(stream)
    return tok, state

def skip_until_eol(stream, t):
    try:
        prev = stream.previous()
    except ValueError:
        prev = None
    while t.type != "NEWLINE":
        t = gen_next(stream)
    # FIXME: ideally, we would like to remove EOL for comments which span the
    # full line, but we need access to the token before the comment delimiter
    # to do so, as we don't want to remove EOL for inline comments (e.g.
    # 'foo # comment')
    if prev and t.type == "NEWLINE" and prev.type in ('NEWLINE', 'INDENT'):
        t = gen_next(stream)
    return t

def next(self):
    # Keep a window of the last two tokens returned so that the previous
    # token can still be looked up after advancing
    c = gen_next(self._gen)
    if len(self._cache) == 2:
        old, new = self._cache
        self._cache = [new]
    self._cache.append(c)
    return c

def _peek_no_dummy(self):
    if self._cache:
        return self._cache
    else:
        i = gen_next(self._it)
        self._cache = i
        return i

def next(self):
    if self._cache:
        i = self._cache
        self._cache = None
        return i
    else:
        return gen_next(self._it)

def multiline_tokenizer(token, state, stream, internal):
    stack = internal.stack
    queue = []
    if internal.stack_level is None:
        internal.stack_level = [len(internal.stack)]
    stack_level = internal.stack_level
    if token.type == "INDENT":
        stack.append(token)
        queue.insert(0, token)
    elif token.type == "DEDENT":
        stack.pop(0)
        if len(stack) < 1:
            state = "SCANNING_FIELD_ID"
        queue.insert(0, token)
    elif token.type == "NEWLINE":
        saved_newline = token
        # Case where there is a single, non-indented line for the field, i.e.:
        # Description: a description
        if (len(stack) == stack_level[0]
                and stream.peek().type != "INDENT"):
            state = "SCANNING_FIELD_ID"
            internal.stack_level = None
        elif stream.peek().type == "DEDENT":
            try:
                while stream.peek().type == "DEDENT":
                    token = gen_next(stream)
                    queue.insert(0, token)
                    stack.pop()
            except StopIteration:
                pass
            if len(stack) == stack_level[0]:
                state = "SCANNING_FIELD_ID"
                internal.stack_level = None
            else:
                queue.append(saved_newline)
        else:
            queue.insert(0, token)
    else:
        queue.insert(0, token)
    try:
        token = gen_next(stream)
    except StopIteration:
        token = None
    return queue, token, state

def _peek_dummy(self):
    if self._cache:
        return self._cache
    else:
        try:
            i = gen_next(self._it)
        except StopIteration:
            return self._dummy
        self._cache = i
        return i

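# Sketch of the contract the peek/next methods above implement: peek() looks
# at most one token ahead and caches it, next() drains the cache before
# touching the underlying iterator, and _peek_dummy substitutes the dummy
# token at end of stream instead of raising. Illustration (the construction
# call is hypothetical):
#
#   p = Peeker(tokens, EOF)
#   a = p.peek()            # caches the next token without consuming it
#   assert p.peek() is a    # peeking again returns the same cached token
#   assert p.next() is a    # next() returns and clears the cached token
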
def tokenize_conditional(stream, token):
    ret = []
    token.type = CONDITIONAL_ID[token.value]
    ret.append(token)
    queue = []
    nxt = stream.peek()
    if nxt.type not in ["COLON", "NEWLINE"]:
        while nxt.type not in ["COLON", "NEWLINE"]:
            if nxt.type not in ["WS"]:
                queue.append(nxt)
            nxt = gen_next(stream)
        queue.append(nxt)
        for q in queue:
            if q.value in CONDITIONAL_ID.keys():
                q.type = CONDITIONAL_ID[q.value]
            ret.append(q)
    return ret, gen_next(stream)

def singleline_tokenizer(token, state, stream):
    if token.type == "NEWLINE":
        state = "SCANNING_FIELD_ID"
        queue = []
    else:
        queue = [token]
    try:
        tok = gen_next(stream)
    except StopIteration:
        tok = None
    return queue, tok, state

def word_tokenizer(token, state, stream):
    queue = []
    state = "SCANNING_FIELD_ID"
    try:
        while token.type != "NEWLINE":
            if token.type == "WORD":
                queue.append(token)
            token = gen_next(stream)
    except StopIteration:
        token = None
    return queue, token, state

def comma_list_tokenizer(token, state, stream, internal):
    queue = []
    state = "SCANNING_FIELD_ID"

    def _filter_ws_before_comma(lst):
        # Drop WS tokens which sit just before or just after a COMMA
        ret = []
        for i, item in enumerate(lst):
            if item.type == "WS":
                if i + 1 < len(lst) and lst[i+1].type == "COMMA":
                    pass
                elif i > 0 and lst[i-1].type == "COMMA":
                    pass
                else:
                    ret.append(item)
            else:
                ret.append(item)
        return ret

    try:
        if token.type != "NEWLINE":
            token, state = _skip_ws(token, stream, state, internal)
            while token.type not in ("NEWLINE",):
                queue.append(token)
                token = gen_next(stream)
        # Eat newline
        token = gen_next(stream)
        if token.type == "INDENT":
            internal.stack.append(token)
            while token.type != "DEDENT":
                if token.type != "NEWLINE":
                    queue.append(token)
                token = gen_next(stream)
            if token.type == "DEDENT":
                internal.stack.pop(0)
            queue.append(token)
        return _filter_ws_before_comma(queue), gen_next(stream), state
    except StopIteration:
        return _filter_ws_before_comma(queue), None, "EOF"

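# Illustration of the whitespace filtering above, on a hypothetical comma
# list (WS tokens adjacent to a COMMA are dropped, other WS tokens are kept):
#
#   foo , bar  ->  WORD "foo", COMMA, WORD "bar"
#   foo bar    ->  WORD "foo", WS " ", WORD "bar"
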
def merge_escaped(stream):
    stream = Peeker(stream, EOF)
    queue = []
    t = gen_next(stream)
    while t:
        if t.escaped:
            queue.append(t)
        else:
            if t.type == "WORD":
                if queue:
                    queue.append(t)
                    n = stream.peek()
                    if not n.escaped:
                        t.value = "".join([c.value for c in queue])
                        yield t
                        queue = []
                else:
                    n = stream.peek()
                    if n.escaped:
                        queue.append(t)
                    else:
                        yield t
            else:
                if queue:
                    queue[-1].value = "".join([c.value for c in queue])
                    queue[-1].type = "WORD"
                    yield queue[-1]
                    queue = []
                yield t
        try:
            t = gen_next(stream)
        except StopIteration:
            if queue:
                t.value = "".join([c.value for c in queue])
                t.type = "WORD"
                yield t
            return

def detect_escaped(stream):
    """Post process the given stream to mark as escaped any character
    preceded by the escaping token."""
    for t in stream:
        if ESCAPING_CHAR[t.type]:
            try:
                t = gen_next(stream)
            except StopIteration:
                raise SyntaxError("EOF while escaping token %r (line %d)" %
                                  (t.value, t.lineno - 1))
            t.escaped = True
        else:
            t.escaped = False
        yield t

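# detect_escaped and merge_escaped work as a pair: the former marks the token
# following an escaping token as escaped (dropping the escaping token
# itself), and the latter glues runs of escaped tokens and the surrounding
# WORDs into a single WORD. A rough sketch, assuming ESCAPING_CHAR flags the
# backslash token type as escaping:
#
#   foo\ bar  ->  WORD "foo", WS(escaped) " ", WORD "bar"  ->  WORD "foo bar"
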
def post_process(stream, lexdata):
    # XXX: this is awfully complicated...
    class _Internal(object):
        def __init__(self):
            self.stack = []
            self.words_stack = []
            self.stack_level = None
    internal = _Internal()

    state = "SCANNING_FIELD_ID"
    stream = Peeker(stream)
    i = gen_next(stream)
    while i:
        if state == "SCANNING_FIELD_ID":
            if i.value in CONDITIONAL_ID.keys():
                queue, i = tokenize_conditional(stream, i)
                for q in queue:
                    yield q
            elif i.value in META_FIELDS_ID.keys():
                queue, i, state = scan_field_id(i, state, stream, lexdata)
                for q in queue:
                    yield q
            else:
                queue, i = find_next(i, stream, internal)
                for q in queue:
                    yield q
        elif state == "SCANNING_SINGLELINE_FIELD":
            queue, i, state = singleline_tokenizer(i, state, stream)
            for q in queue:
                yield q
        elif state == "SCANNING_MULTILINE_FIELD":
            queue, i, state = multiline_tokenizer(i, state, stream, internal)
            while len(queue) > 0:
                yield queue.pop()
        elif state == "SCANNING_WORD_FIELD":
            queue, i, state = word_tokenizer(i, state, stream)
            for t in queue:
                yield t
        elif state == "SCANNING_WORDS_FIELD":
            queue, i, state = words_tokenizer(i, state, stream, internal)
            for q in queue:
                yield q
        elif state == "SCANNING_COMMA_LIST_FIELD":
            queue, i, state = comma_list_tokenizer(i, state, stream, internal)
            for q in queue:
                yield q
        else:
            raise ValueError("Unknown state: %s" % state)

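# Overview of the state machine driven by post_process above; each
# SCANNING_*_FIELD state is selected in scan_field_id through
# _FIELD_TYPE_TO_STATE and hands control back to SCANNING_FIELD_ID once the
# field body ends:
#
#   SCANNING_FIELD_ID           looking for the next field identifier or
#                               conditional
#   SCANNING_SINGLELINE_FIELD   field body ends at the first NEWLINE
#   SCANNING_MULTILINE_FIELD    field body is an INDENT/DEDENT delimited block
#   SCANNING_WORD_FIELD         field body is WORD tokens up to NEWLINE
#   SCANNING_WORDS_FIELD        field body is an indented block of WORD tokens
#   SCANNING_COMMA_LIST_FIELD   field body is a comma-separated list, possibly
#                               indented
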
def find_next(token, stream, internal):
    queue = []
    if token.type != "NEWLINE":
        if token.type == "INDENT":
            internal.stack.append(token)
        elif token.type == "DEDENT":
            internal.stack.pop(0)
        queue.append(token)
    try:
        tok = gen_next(stream)
    except StopIteration:
        tok = None
    return queue, tok

def words_tokenizer(token, state, stream, internal):
    token, state = _skip_ws(token, stream, state, internal)
    if state == "SCANNING_WORDS_FIELD":
        words_stack = internal.words_stack
        if token.type == "INDENT":
            words_stack.append(token)
        elif token.type == "DEDENT":
            words_stack.pop(0)
            if len(words_stack) < 1:
                state = "SCANNING_FIELD_ID"
                internal.words_stack = []
        queue = [token]
    else:
        queue = []
    try:
        tok = gen_next(stream)
    except StopIteration:
        tok = None
    return queue, tok, state

def token(self, *a, **kw):
    try:
        return gen_next(self.token_stream)
    except StopIteration:
        return None

def indent_generator(toks):
    """Post process the given stream of tokens to generate INDENT/DEDENT
    tokens.

    Note
    ----
    Each generated token's value is the total amount of spaces from the
    beginning of the line.

    The way indentation tokens are generated is similar to how it works in
    python."""
    stack = [0]

    # Dummy token to track the token just before the current one
    former = LexToken()
    former.type = "NEWLINE"
    former.value = "dummy"
    former.lineno = 0
    former.lexpos = -1

    def generate_dedent(stck, tok):
        amount = stck.pop(0)
        return new_dedent(amount, tok)

    for token in toks:
        if former.type == "NEWLINE":
            if token.type == "WS":
                indent = len(token.value)
            else:
                indent = 0
            if indent == stack[0]:
                former = token
                if indent > 0:
                    token = gen_next(toks)
                    former = token
                    yield token
                else:
                    yield former
            elif indent > stack[0]:
                stack.insert(0, indent)
                ind = new_indent(indent, token)
                former = ind
                yield ind
            elif indent < stack[0]:
                if indent not in stack:
                    raise ValueError("Wrong indent at line %d" % token.lineno)
                while stack[0] > indent:
                    former = generate_dedent(stack, token)
                    yield former
                if stack[0] > 0:
                    former = gen_next(toks)
                    yield former
                else:
                    former = token
                    yield token
        else:
            former = token
            yield token

    # Generate additional DEDENT so that the number of INDENT/DEDENT always
    # match
    while len(stack) > 1:
        former = generate_dedent(stack, token)
        yield former
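
# A minimal sketch of the indentation handling above, assuming WS tokens
# carry the literal leading whitespace and new_indent/new_dedent build the
# INDENT/DEDENT tokens (the input lines are a hypothetical field):
#
#   Description:       stack [0]
#       some words     INDENT(4) emitted, stack [4, 0]
#   Name: foo          DEDENT emitted, stack [0]
#
# As in python, the trailing while loop emits the DEDENT tokens still owed at
# end of input, so that INDENT/DEDENT tokens always come in matched pairs.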