def tokenize_string_literal(self):
    """
    Tokenize a text string literal, yields
    Token(TokenKind.STRING_LITERAL, 1, 7) for stream containing '"string"'.
    """
    yield from self.tokenize_delimiter()
    start = self.stream.tell()
    read_char = self.stream.read(1)
    if read_char == '"':
        literal_start = self.stream.tell()
        read_char = self.stream.read(1)
        literal_end = self.stream.tell()
        while read_char and read_char != '"':
            literal_end = self.stream.tell()
            read_char = self.stream.read(1)
        if not read_char:
            self.stream.seek(start)
            raise TokenizationError(
                "Reached end of stream while reading string literal"
            )
        yield Token(TokenKind.STRING_LITERAL, literal_start, literal_end)
    else:
        self.stream.seek(start)
        raise TokenizationError(f"Expected string at {start}")

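# A minimal standalone sketch (hypothetical helper, not part of the tokenizer
# class): traces the positions recorded by the scan above for a stream
# containing '"string"', skipping delimiter handling and the Token type.
# It returns (1, 7), the same span as the docstring example.
import io

def _example_string_literal_span():
    stream = io.StringIO('"string"')
    stream.read(1)                 # consume the opening quote
    literal_start = stream.tell()  # 1, first character of the literal
    read_char = stream.read(1)
    literal_end = stream.tell()
    while read_char and read_char != '"':
        literal_end = stream.tell()
        read_char = stream.read(1)
    return literal_start, literal_end  # (1, 7)
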
def tokenize_delimiter(self):
    start = self.stream.tell()
    read_char = self.stream.read(1)
    if read_char != b"\0":
        self.stream.seek(start)
        raise TokenizationError(f"Expected delimiter at {start}")
    return iter([])

def binary_ended_tokenizer():
    tok = next(tokenizer())
    start = self.stream.tell()
    read_char = self.stream.read(1)
    if read_char != b"\0":
        self.stream.seek(start)
        raise TokenizationError(f"Expected delimiter at {start}")
    yield tok

def tokenize_end_of_file(self):
    start = self.stream.tell()
    read_char = self.stream.read(1)
    if read_char:
        self.stream.seek(start)
        raise TokenizationError(
            f"Expected end of file or new tag at {start}, got {read_char}"
        )
    return iter([])

def tokenize_end_of_file(self):
    yield from self.tokenize_delimiter()
    start = self.stream.tell()
    read_char = self.stream.read(1)
    if read_char:
        self.stream.seek(start)
        raise TokenizationError(
            f"Expected end of file or new tag at {start}, got {read_char}"
        )

def word_tokenizer():
    start = stream.tell()
    token = stream.read(word_len)
    if token == word:
        end = stream.tell()
        yield Token(kind, end - word_len, end)
    else:
        stream.seek(start)
        raise TokenizationError(f"Token {token!r} did not match {word}")

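# Hypothetical factory sketch: word_tokenizer above closes over stream, word,
# word_len and kind, so it is presumably built by something like the helper
# below. The names _make_word_tokenizer, _Token and _TokenizationError are
# assumptions used to keep this snippet self-contained, not the library's API.
import io
from collections import namedtuple

_Token = namedtuple("Token", ["kind", "start", "end"])

class _TokenizationError(Exception):
    pass

def _make_word_tokenizer(stream, word, kind):
    word_len = len(word)

    def word_tokenizer():
        start = stream.tell()
        token = stream.read(word_len)
        if token == word:
            end = stream.tell()
            yield _Token(kind, end - word_len, end)
        else:
            stream.seek(start)
            raise _TokenizationError(f"Token {token!r} did not match {word}")

    return word_tokenizer

# list(_make_word_tokenizer(io.StringIO("tag dims"), "tag", "TAG")())
# -> [Token(kind='TAG', start=0, end=3)]
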
def tokenize_comment(self):
    """
    Tokenize a comment from the start of the stream. Note: does not
    actually yield a token for the comment, it just consumes the comment
    and returns an empty iterator.
    """
    start = self.stream.tell()
    read_char = self.stream.read(1)
    if read_char == "#":
        read_char = self.stream.read(1)
        while read_char and read_char != "#":
            read_char = self.stream.read(1)
        if not read_char:
            self.stream.seek(start)
            raise TokenizationError("Reached end of stream while reading comment")
        return iter([])
    else:
        self.stream.seek(start)
        raise TokenizationError(f"Expected comment at {start}")

def tokenize_space(self):
    start = self.stream.tell()
    read_char = self.stream.read(1)
    if not read_char.isspace():
        self.stream.seek(start)
        raise TokenizationError(f"Expected space at start, got {read_char}")
    while read_char.isspace():
        first_non_space = self.stream.tell()
        read_char = self.stream.read(1)
    self.stream.seek(first_non_space)
    return iter([])

def tokenize_header(stream):
    start = stream.tell()
    header = stream.read(8)
    if header == b"roff-bin":
        read_char = stream.read(1)
        if read_char != b"\0":
            stream.seek(start)
            raise TokenizationError(
                f"Expected delimiter after header token, got {read_char}"
            )
        yield Token(TokenKind.ROFF_BIN, start, start + 8)
    elif header == "roff-asc":
        yield Token(TokenKind.ROFF_ASC, start, start + 8)
    elif header == b"roff-asc":
        raise WrongFileModeError(
            "Ascii formatted roff file was opened in binary mode!"
        )
    elif header == "roff-bin":
        raise WrongFileModeError(
            "Binary formatted roff file was opened in text mode!"
        )
    else:
        stream.seek(start)
        raise TokenizationError(f"Did not find roff header, got {header}.")

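# Hedged illustration of why tokenize_header compares against both bytes and
# str: a binary-mode stream returns bytes, so an ASCII header read from it
# matches b"roff-asc" (a mode mismatch) rather than the str "roff-asc".
# The stream contents here are made up for the example.
import io

_binary_stream = io.BytesIO(b"roff-asc\0")
_header = _binary_stream.read(8)
# _header == b"roff-asc" -> True  (ASCII file opened in binary mode)
# _header == "roff-asc"  -> False (a text-mode read would give str instead)
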
def tokenizer():
    start = self.stream.tell()
    last_alnum = start
    read_char = self.stream.read(1)
    while read_char and read_char != b"\0":
        last_alnum = self.stream.tell()
        read_char = self.stream.read(1)
    if read_char != b"\0":
        self.stream.seek(start)
        raise TokenizationError(f"could not tokenize string at {start}")
    else:
        yield Token(kind, start, last_alnum)

def one_of_tokenizer():
    did_yield = False
    errors = []
    for tok in tokenizers:
        try:
            yield from tok()
            did_yield = True
            break
        except TokenizationError as err:
            errors.append(str(err))
    if not did_yield:
        raise TokenizationError(
            "Tokenization failed due to one of\n*" + "\n*".join(errors)
        )

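# Standalone sketch of the "try each alternative, collect errors" pattern that
# one_of_tokenizer implements. The factory name _one_of and the dummy
# tokenizers are hypothetical, added only so the snippet runs on its own.
class _OneOfTokenizationError(Exception):
    pass

def _one_of(*tokenizers):
    def one_of_tokenizer():
        did_yield = False
        errors = []
        for tok in tokenizers:
            try:
                yield from tok()
                did_yield = True
                break
            except _OneOfTokenizationError as err:
                errors.append(str(err))
        if not did_yield:
            raise _OneOfTokenizationError(
                "Tokenization failed due to one of\n*" + "\n*".join(errors)
            )

    return one_of_tokenizer

def _never_matches():
    raise _OneOfTokenizationError("no match here")
    yield  # unreachable, but makes this a generator function

def _matches():
    yield "TOKEN"

# list(_one_of(_never_matches, _matches)()) -> ['TOKEN']
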
def tokenize_name(self):
    yield from self.tokenize_delimiter()
    start = self.stream.tell()
    length = 0
    read_char = self.stream.read(1)
    while read_char and not read_char.isspace():
        length += 1
        read_char = self.stream.read(1)
    if length < 1:
        self.stream.seek(start)
        raise TokenizationError(f"could not tokenize name at {start}")
    yield Token(TokenKind.NAME, start, start + length)

def tokenize_numeric_value(self):
    """
    Tokenize any text numeric value, yields
    Token(TokenKind.NUMERIC_VALUE, 0, 3) for stream containing "1.0".
    """
    yield from self.tokenize_delimiter()
    start = self.stream.tell()
    end = start
    read_char = self.stream.read(1)
    if read_char and (read_char.isnumeric() or read_char == "-"):
        while read_char and (read_char.isnumeric() or read_char in ".eE+-"):
            end = self.stream.tell()
            read_char = self.stream.read(1)
    if end - start < 1:
        self.stream.seek(start)
        raise TokenizationError(f"Expected numeric value at {start}")
    else:
        self.stream.seek(end)
        yield Token(TokenKind.NUMERIC_VALUE, start, end)

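# Standalone sketch of the numeric scan above (delimiter handling and the
# Token type left out): shows the start/end bookkeeping on a plain StringIO.
# The helper name is hypothetical.
import io

def _example_numeric_span(text):
    stream = io.StringIO(text)
    start = stream.tell()
    end = start
    read_char = stream.read(1)
    if read_char and (read_char.isnumeric() or read_char == "-"):
        while read_char and (read_char.isnumeric() or read_char in ".eE+-"):
            end = stream.tell()
            read_char = stream.read(1)
    stream.seek(end)
    return start, end

# _example_numeric_span("1.0 2.0")   -> (0, 3), the docstring example
# _example_numeric_span("-1.5e-3 x") -> (0, 7)
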
def tokenlen(tokenkind):
    """
    For fixed bytesize types, return the number of bytes used to store
    that type.

    :param tokenkind: A fixed bytesize type, e.g. TokenKind.BOOL.
    :returns: The number of bytes used for that type.
    """
    if tokenkind == TokenKind.BOOL:
        return 1
    elif tokenkind == TokenKind.BYTE:
        return 1
    elif tokenkind == TokenKind.INT:
        return 4
    elif tokenkind == TokenKind.FLOAT:
        return 4
    elif tokenkind == TokenKind.DOUBLE:
        return 8
    else:
        raise TokenizationError(
            f"Attempted to read non-fixed size type {tokenkind}"
        )

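# Hedged cross-check of the sizes above against struct format codes; the
# mapping and the byte order ("<", chosen arbitrarily here) are assumptions
# for illustration, not the library's decoding routine.
import struct

_SIZE_CHECK = {
    "BOOL": "B",     # 1 byte
    "BYTE": "B",     # 1 byte
    "INT": "<i",     # 4 bytes
    "FLOAT": "<f",   # 4 bytes
    "DOUBLE": "<d",  # 8 bytes
}

# {kind: struct.calcsize(fmt) for kind, fmt in _SIZE_CHECK.items()}
# -> {'BOOL': 1, 'BYTE': 1, 'INT': 4, 'FLOAT': 4, 'DOUBLE': 8}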