def add_chunk(chunk, tokens):
    """Convert chunk into a token if possible and add to tokens.

    If chunk is non-empty but cannot be made into a token, this function
    raises a CompilerError. We don't need to check for symbol kind tokens
    here because they are converted before they are shifted into the chunk.

    chunk - Chunk to convert into a token, as list of Tagged characters.
    tokens (List[Token]) - List of the tokens thus far parsed.

    """
    if chunk:
        range = Range(chunk[0].p, chunk[-1].p)

        keyword_kind = match_keyword_kind(chunk)
        if keyword_kind:
            tokens.append(Token(keyword_kind, r=range))
            return

        number_string = match_number_string(chunk)
        if number_string:
            tokens.append(Token(token_kinds.number, number_string, r=range))
            return

        identifier_name = match_identifier_name(chunk)
        if identifier_name:
            tokens.append(Token(
                token_kinds.identifier, identifier_name, r=range))
            return

        descrip = "unrecognized token at '{}'".format(chunk_to_str(chunk))
        raise CompilerError(descrip, range)
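
# Hedged usage sketch (not part of the project source): assuming Tagged,
# Range, and the match_* helpers behave as described in the docstring above,
# classifying the chunk for the text "return" would look roughly like this:
#
#     chunk = [...]          # the Tagged characters spelling "return"
#     tokens = []
#     add_chunk(chunk, tokens)
#     # match_keyword_kind(chunk) matches first, so tokens now holds a single
#     # keyword Token whose Range spans chunk[0].p to chunk[-1].p.
#
# A chunk containing a stray character such as `@` would fail all three
# matchers and raise a CompilerError with the "unrecognized token at ..."
# message.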
def __init__(self, message, index, tokens, message_type):
    """Initialize a ParserError from the given arguments.

    message (str) - Base message to put in the error.
    tokens (List[Token]) - List of tokens.
    index (int) - Index of the offending token.
    message_type (int) - One of self.AT, self.GOT, or self.AFTER.

    Example:
        ParserError("unexpected semicolon", 10, [...], self.AT)
            -> CompilerError("unexpected semicolon at ';'", ..., ...)
            -> "main.c:10: unexpected semicolon at ';'"

    """
    self.amount_parsed = index

    if len(tokens) == 0:
        super().__init__("{} at beginning of source".format(message))
        return

    # If the index is too big, we're always using the AFTER form
    if index >= len(tokens):
        index = len(tokens)
        message_type = self.AFTER
    # If the index is too small, we should not use the AFTER form
    elif index <= 0:
        index = 0
        if message_type == self.AFTER:
            message_type = self.GOT

    if message_type == self.AT:
        super().__init__("{} at '{}'".format(message, tokens[index]),
                         tokens[index].r)
    elif message_type == self.GOT:
        super().__init__("{}, got '{}'".format(message, tokens[index]),
                         tokens[index].r)
    elif message_type == self.AFTER:
        if tokens[index - 1].r:
            new_range = Range(tokens[index - 1].r.end + 1)
        else:
            new_range = None

        super().__init__(
            "{} after '{}'".format(message, tokens[index - 1]), new_range)
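
# Hedged usage sketch (the parser itself is not shown here, so the call site
# is assumed): a parser that expected a semicolon after the previous token
# might report it as
#
#     raise ParserError("expected semicolon", index, tokens, ParserError.AFTER)
#
# which, via the AFTER branch, points the error Range just past the previous
# token, e.g. "main.c:10: expected semicolon after 'return'". The AT form
# anchors the message on tokens[index] itself ("... at ';'"), and the GOT
# form quotes the offending token with ", got '...'".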
def __init__(self, c, p):
    """Initialize object.

    c - Character being tagged.
    p - Position of the character in the source.

    """
    self.c = c
    self.p = p
    self.r = Range(p, p)
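
# Hedged illustration (the Position class is assumed from context rather than
# shown here): each source character is wrapped in a Tagged object so the
# lexer can recover location information later, e.g.
#
#     tagged = Tagged("i", some_position)
#     tagged.c   # the raw character, "i"
#     tagged.p   # its Position in the source
#     tagged.r   # a single-character Range, used when reporting errors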
def tokenize_line(line, in_comment):
    """Tokenize the given single line.

    line - List of Tagged objects.
    in_comment - Whether the first character in this line is part of a
    C-style comment body.
    return - List of Token objects, and boolean indicating whether the next
    character is part of a comment body.

    """
    tokens = []

    # line[chunk_start:chunk_end] is the section of the line currently
    # being considered for conversion into a token; this string will be
    # called the 'chunk'. Everything before the chunk has already been
    # tokenized, and everything after has not yet been examined.
    chunk_start = 0
    chunk_end = 0

    # Flag that is set True if the line begins with `#` and `include`,
    # perhaps with comments and whitespace in between.
    include_line = False
    # Flag that is set True if the line is an include directive and the
    # filename has been seen and successfully parsed.
    seen_filename = False

    while chunk_end < len(line):
        symbol_kind = match_symbol_kind_at(line, chunk_end)
        next = match_symbol_kind_at(line, chunk_end + 1)

        # Set include_line flag True as soon as a `#include` is detected.
        if match_include_command(tokens):
            include_line = True

        if in_comment:
            # If next characters end the comment...
            if symbol_kind == token_kinds.star and next == token_kinds.slash:
                in_comment = False
                chunk_start = chunk_end + 2
                chunk_end = chunk_start
            # Otherwise, just skip one character.
            else:
                chunk_start = chunk_end + 1
                chunk_end = chunk_start

        # If next characters start a comment, process previous chunk and
        # set in_comment to True.
        elif symbol_kind == token_kinds.slash and next == token_kinds.star:
            add_chunk(line[chunk_start:chunk_end], tokens)
            in_comment = True

        # If next two characters are //, we skip the rest of this line.
        elif symbol_kind == token_kinds.slash and next == token_kinds.slash:
            break

        # Skip spaces and process previous chunk.
        elif line[chunk_end].c.isspace():
            add_chunk(line[chunk_start:chunk_end], tokens)
            chunk_start = chunk_end + 1
            chunk_end = chunk_start

        # If this is an include line, and not a comment or whitespace,
        # expect the line to match an include filename.
        elif include_line:

            # If the filename has already been seen, there should be no
            # more tokens.
            if seen_filename:
                descrip = "extra tokens at end of include directive"
                raise CompilerError(descrip, line[chunk_end].r)

            filename, end = read_include_filename(line, chunk_end)
            tokens.append(Token(token_kinds.include_file, filename,
                                r=Range(line[chunk_end].p, line[end].p)))

            chunk_start = end + 1
            chunk_end = chunk_start
            seen_filename = True

        # If next character is a quote, we read the whole string as a token.
        elif symbol_kind in {token_kinds.dquote, token_kinds.squote}:
            if symbol_kind == token_kinds.dquote:
                quote_str = '"'
                kind = token_kinds.string
                add_null = True
            else:
                quote_str = "'"
                kind = token_kinds.char_string
                add_null = False

            chars, end = read_string(line, chunk_end + 1, quote_str,
                                     add_null)
            rep = chunk_to_str(line[chunk_end:end + 1])
            r = Range(line[chunk_end].p, line[end].p)

            if kind == token_kinds.char_string and len(chars) == 0:
                err = "empty character constant"
                error_collector.add(CompilerError(err, r))
            elif kind == token_kinds.char_string and len(chars) > 1:
                err = "multiple characters in character constant"
                error_collector.add(CompilerError(err, r))

            tokens.append(Token(kind, chars, rep, r=r))

            chunk_start = end + 1
            chunk_end = chunk_start

        # If next character is another symbol, add previous chunk and then
        # add the symbol.
        elif symbol_kind:
            symbol_start_index = chunk_end
            symbol_end_index = chunk_end + len(symbol_kind.text_repr) - 1

            r = Range(line[symbol_start_index].p, line[symbol_end_index].p)
            symbol_token = Token(symbol_kind, r=r)

            add_chunk(line[chunk_start:chunk_end], tokens)
            tokens.append(symbol_token)

            chunk_start = chunk_end + len(symbol_kind.text_repr)
            chunk_end = chunk_start

        # Include another character in the chunk.
        else:
            chunk_end += 1

    # Flush out anything that is left in the chunk to the output
    add_chunk(line[chunk_start:chunk_end], tokens)

    # Catch a `#include` on a line by itself.
    if (include_line or match_include_command(tokens)) and not seen_filename:
        read_include_filename(line, chunk_end)

    return tokens, in_comment
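
# Hedged usage sketch (the code that builds the Tagged lines is not shown
# here, so `tagged_lines` is a stand-in name): the comment state is threaded
# from one call to the next so multi-line /* ... */ comments are skipped.
#
#     in_comment = False
#     all_tokens = []
#     for tagged_line in tagged_lines:
#         line_tokens, in_comment = tokenize_line(tagged_line, in_comment)
#         all_tokens += line_tokens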