Example #1
def add_chunk(chunk, tokens):
    """Convert chunk into a token if possible and add to tokens.

    If chunk is non-empty but cannot be made into a token, this function
    raises a CompilerError. We don't need to check for symbol kind tokens
    here because they are converted before they are shifted into the chunk.

    chunk - Chunk to convert into a token, as list of Tagged characters.
    tokens (List[Token]) - List of the tokens thus far parsed.

    """
    if chunk:
        range = Range(chunk[0].p, chunk[-1].p)

        keyword_kind = match_keyword_kind(chunk)
        if keyword_kind:
            tokens.append(Token(keyword_kind, r=range))
            return

        number_string = match_number_string(chunk)
        if number_string:
            tokens.append(Token(token_kinds.number, number_string, r=range))
            return

        identifier_name = match_identifier_name(chunk)
        if identifier_name:
            tokens.append(Token(
                token_kinds.identifier, identifier_name, r=range))
            return

        descrip = "unrecognized token at '{}'".format(chunk_to_str(chunk))
        raise CompilerError(descrip, range)
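
A minimal usage sketch follows, assuming the Tagged(c, p) constructor shown in Example #3 below and a hypothetical make_pos helper standing in for whatever source-position object the lexer uses; it mirrors how tokenize_line in Example #4 hands chunks to this function.

# Hypothetical driver for add_chunk: wrap each character of a would-be
# token in a Tagged object, then let add_chunk turn the chunk into a Token.
tokens = []
chunk = [Tagged(c, make_pos(col)) for col, c in enumerate("return")]
add_chunk(chunk, tokens)  # appends a keyword token for `return`
add_chunk([], tokens)     # empty chunk: nothing appended, no error raised
# An unrecognizable non-empty chunk raises a CompilerError instead.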
Example #2
    def __init__(self, message, index, tokens, message_type):
        """Initialize a ParserError from the given arguments.

        message (str) - Base message to put in the error.
        index (int) - Index of the offending token.
        tokens (List[Token]) - List of tokens.
        message_type (int) - One of self.AT, self.GOT, or self.AFTER.

        Example:
            ParserError("unexpected semicolon", 10, [...], self.AT)
               -> CompilerError("unexpected semicolon at ';'", ..., ...)
               -> "main.c:10: unexpected semicolon at ';'"
        """
        self.amount_parsed = index

        if len(tokens) == 0:
            super().__init__("{} at beginning of source".format(message))
            return

        # If the index is too big, we're always using the AFTER form
        if index >= len(tokens):
            index = len(tokens)
            message_type = self.AFTER
        # If the index is too small, we should not use the AFTER form
        elif index <= 0:
            index = 0
            if message_type == self.AFTER:
                message_type = self.GOT

        if message_type == self.AT:
            super().__init__("{} at '{}'".format(message, tokens[index]),
                             tokens[index].r)
        elif message_type == self.GOT:
            super().__init__("{}, got '{}'".format(message, tokens[index]),
                             tokens[index].r)
        elif message_type == self.AFTER:
            if tokens[index - 1].r:
                new_range = Range(tokens[index - 1].r.end + 1)
            else:
                new_range = None

            super().__init__(
                "{} after '{}'".format(message, tokens[index - 1]), new_range)
Example #3
    def __init__(self, c, p):
        """Initialize object."""
        self.c = c            # the character itself
        self.p = p            # position of this character in the source
        self.r = Range(p, p)  # single-character range at that position
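
Examples #1 and #4 treat a source line as a list of these Tagged characters; constructing one could look like this (a sketch, with Position used as a purely illustrative stand-in for the lexer's source-location type):

# Hypothetical: tag every character of a line with its source position so
# later stages can attach exact locations to errors.
line_str = "int x;"
line = [Tagged(c, Position("main.c", 1, col + 1))
        for col, c in enumerate(line_str)]
line[0].c  # 'i'
line[0].r  # a Range covering just this one character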
Example #4
def tokenize_line(line, in_comment):
    """Tokenize the given single line.

    line - List of Tagged objects.
    in_comment - Whether the first character in this line is part of a
    C-style comment body.
    return - List of Token objects, and boolean indicating whether the next
    character is part of a comment body.
    """
    tokens = []

    # line[chunk_start:chunk_end] is the section of the line currently
    # being considered for conversion into a token; this string will be
    # called the 'chunk'. Everything before the chunk has already been
    # tokenized, and everything after has not yet been examined.
    chunk_start = 0
    chunk_end = 0

    # Flag that is set True if the line begins with `#` and `include`,
    # perhaps with comments and whitespace in between.
    include_line = False
    # Flag that is set True if the line is an include directive and the
    # filename has been seen and successfully parsed.
    seen_filename = False

    while chunk_end < len(line):
        symbol_kind = match_symbol_kind_at(line, chunk_end)
        next = match_symbol_kind_at(line, chunk_end + 1)

        # Set include_line flag True as soon as a `#include` is detected.
        if match_include_command(tokens):
            include_line = True

        if in_comment:
            # If next characters end the comment...
            if symbol_kind == token_kinds.star and next == token_kinds.slash:
                in_comment = False
                chunk_start = chunk_end + 2
                chunk_end = chunk_start
            # Otherwise, just skip one character.
            else:
                chunk_start = chunk_end + 1
                chunk_end = chunk_start

        # If next characters start a comment, process previous chunk and set
        # in_comment to true.
        elif symbol_kind == token_kinds.slash and next == token_kinds.star:
            add_chunk(line[chunk_start:chunk_end], tokens)
            in_comment = True

        # If next two characters are //, we skip the rest of this line.
        elif symbol_kind == token_kinds.slash and next == token_kinds.slash:
            break

        # Skip spaces and process previous chunk.
        elif line[chunk_end].c.isspace():
            add_chunk(line[chunk_start:chunk_end], tokens)
            chunk_start = chunk_end + 1
            chunk_end = chunk_start

        # If this is an include line, and not a comment or whitespace,
        # expect the line to match an include filename.
        elif include_line:

            # If the filename has already been seen, there should be no more
            # tokens.
            if seen_filename:
                descrip = "extra tokens at end of include directive"
                raise CompilerError(descrip, line[chunk_end].r)

            filename, end = read_include_filename(line, chunk_end)
            tokens.append(Token(token_kinds.include_file, filename,
                                r=Range(line[chunk_end].p, line[end].p)))

            chunk_start = end + 1
            chunk_end = chunk_start
            seen_filename = True

        # If next character is a quote, we read the whole string as a token.
        elif symbol_kind in {token_kinds.dquote, token_kinds.squote}:
            if symbol_kind == token_kinds.dquote:
                quote_str = '"'
                kind = token_kinds.string
                add_null = True
            else:
                quote_str = "'"
                kind = token_kinds.char_string
                add_null = False

            chars, end = read_string(line, chunk_end + 1, quote_str, add_null)
            rep = chunk_to_str(line[chunk_end:end + 1])
            r = Range(line[chunk_end].p, line[end].p)

            if kind == token_kinds.char_string and len(chars) == 0:
                err = "empty character constant"
                error_collector.add(CompilerError(err, r))
            elif kind == token_kinds.char_string and len(chars) > 1:
                err = "multiple characters in character constant"
                error_collector.add(CompilerError(err, r))

            tokens.append(Token(kind, chars, rep, r=r))

            chunk_start = end + 1
            chunk_end = chunk_start

        # If next character is another symbol, add previous chunk and then
        # add the symbol.
        elif symbol_kind:
            symbol_start_index = chunk_end
            symbol_end_index = chunk_end + len(symbol_kind.text_repr) - 1

            r = Range(line[symbol_start_index].p, line[symbol_end_index].p)
            symbol_token = Token(symbol_kind, r=r)

            add_chunk(line[chunk_start:chunk_end], tokens)
            tokens.append(symbol_token)

            chunk_start = chunk_end + len(symbol_kind.text_repr)
            chunk_end = chunk_start

        # Include another character in the chunk.
        else:
            chunk_end += 1

    # Flush out anything that is left in the chunk to the output
    add_chunk(line[chunk_start:chunk_end], tokens)

    # Catch a `#include` on a line by itself.
    if (include_line or match_include_command(tokens)) and not seen_filename:
        read_include_filename(line, chunk_end)

    return tokens, in_comment
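
Per the docstring, tokenize_line returns the tokens for one line along with the comment state to carry into the next line, so a caller might thread that flag through a loop like this (a sketch; producing tagged_lines, the per-line lists of Tagged characters, is assumed):

# Hypothetical driver: tokenize a file line by line, threading in_comment so
# /* ... */ comments that span several lines are handled correctly.
all_tokens = []
in_comment = False
for line in tagged_lines:  # each entry is a list of Tagged characters
    tokens, in_comment = tokenize_line(line, in_comment)
    all_tokens.extend(tokens)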