def consume(cls, ctx, tokens, breakset=None):
  """
  Consume tokens and return a tree of nodes. Top-level consumer parses
  comments, whitespace, statements, and flow control blocks.
  """
  if breakset is None:
    breakset = ()

  tree = cls()
  while tokens:
    kind = tokens[0].type
    if kind in WHITESPACE_TOKENS:
      tree.children.append(WhitespaceNode.consume(ctx, tokens))
    elif kind in COMMENT_TOKENS:
      tree.children.append(CommentNode.consume(ctx, tokens))
    elif kind in ONOFF_TOKENS:
      tree.children.append(OnOffNode.consume(ctx, tokens))
    elif kind == lexer.TokenType.BRACKET_COMMENT:
      tree.children.append(CommentNode.consume(ctx, tokens))
    elif kind == lexer.TokenType.WORD:
      upper = tokens[0].spelling.upper()
      if upper in breakset:
        # The word matches a breakword of a parent parser (e.g. "endif"),
        # so this body is finished; leave the token for the parent.
        return tree
      if FlowType.get(upper) is None:
        tree.children.append(StatementNode.consume(ctx, tokens))
      else:
        tree.children.append(FlowControlNode.consume(ctx, tokens))
    elif kind == lexer.TokenType.ATWORD:
      tree.children.append(AtWordStatementNode.consume(ctx, tokens))
    elif kind == lexer.TokenType.BYTEORDER_MARK:
      # A byte-order mark carries no content; drop it from the stream.
      tokens.pop(0)
    else:
      raise InternalError("Unexpected {} token at {}:{}".format(
          tokens[0].type.name, tokens[0].begin.line, tokens[0].begin.col))
  return tree
def consume(cls, ctx, tokens):
  """
  Consume a complete statement, removing tokens from the input list and
  returning a STATEMENT node.

  A statement is a function name, optional whitespace, an l-paren, an
  argument subtree (delegated to a parser looked up in ``ctx.parse_db``),
  optional trailing whitespace/comments, and an r-paren.
  """
  node = cls()

  # Consume the function name
  fnname = tokens[0].spelling.lower()
  node.funnode = funnode = FunctionNameNode.parse(ctx, tokens)
  node.children.append(funnode)

  # Consume whitespace up to the parenthesis
  while tokens and tokens[0].type in WHITESPACE_TOKENS:
    node.children.append(tokens.pop(0))

  # TODO(josh): should the parens belong to the statement node or the
  # group node?
  # NOTE(review): this raises ValueError while the r-paren checks below
  # raise UserError — kept as-is since callers may catch ValueError.
  if tokens[0].type != lexer.TokenType.LEFT_PAREN:
    raise ValueError(
        "Unexpected {} token at {}, expecting l-paren, got {}".format(
            tokens[0].type.name, tokens[0].get_location(),
            repr(tokens[0].content)))

  lparen = TreeNode(NodeType.LPAREN)
  lparen.children.append(tokens.pop(0))
  node.children.append(lparen)

  # Consume whitespace between the l-paren and the first argument
  while tokens and tokens[0].type in WHITESPACE_TOKENS:
    node.children.append(tokens.pop(0))

  # The argument parser stops at the matching r-paren (or a parser-specific
  # break condition); look up a command-specific parser, falling back to
  # "_default" from the parse_db, then to the standard parser.
  breakstack = [ParenBreaker()]
  parse_fun = ctx.parse_db.get(fnname, None)
  if parse_fun is None:
    # If the parse_db provides a "_default" then use that. Otherwise use the
    # standard parser with no kwargs or flags.
    parse_fun = ctx.parse_db.get("_default", StandardParser())
  node.argtree = subtree = parse_fun(ctx, tokens, breakstack)
  node.children.append(subtree)

  # NOTE(josh): technically we may have a statement specification with
  # an exact number of arguments. At this point we have broken out of that
  # statement but we might have some comments or whitespace to consume
  while tokens and tokens[0].type != lexer.TokenType.RIGHT_PAREN:
    if tokens[0].type in WHITESPACE_TOKENS:
      node.children.append(tokens.pop(0))
      continue
    if tokens[0].type in COMMENT_TOKENS:
      cnode = CommentNode.consume(ctx, tokens)
      node.children.append(cnode)
      continue
    raise UserError(
        "Unexpected {} token at {}, expecting r-paren, got {}".format(
            tokens[0].type.name, tokens[0].get_location(),
            repr(tokens[0].content)))

  if not tokens:
    raise UserError(
        "Unexpected end of token stream while parsing statement:\n {}".format(
            tree_string([node])))

  if tokens[0].type != lexer.TokenType.RIGHT_PAREN:
    raise UserError(
        "Unexpected {} token at {}, expecting r-paren, got {}".format(
            tokens[0].type.name, tokens[0].get_location(),
            repr(tokens[0].content)))

  rparen = TreeNode(NodeType.RPAREN)
  rparen.children.append(tokens.pop(0))
  node.children.append(rparen)
  # A comment on the same line as the closing paren attaches to the statement
  CommentNode.consume_trailing(ctx, tokens, node)

  return node
def parse(cls, ctx, tokens, npargs, flags, breakstack, sortable=False):
  """
  Parse a continuous sequence of `npargs` positional arguments. If npargs
  is an integer we will consume exactly that many arguments. If it is not
  an integer then it is a string meaning:

  * "?": zero or one
  * "*": zero or more
  * "+": one or more

  Tokens matching an entry in `flags` become FLAG nodes; everything else
  becomes ARGUMENT nodes. Returns the populated positional-group node;
  consumed tokens are removed from `tokens` in place.
  """
  tree = cls(sortable=sortable)
  tree.spec = PositionalSpec(npargs, flags)
  nconsumed = 0

  # Strip off any preceding whitespace (note that in most cases this has
  # already been done, but in some cases -- such as the kwarg subparser --
  # it hasn't)
  while tokens and tokens[0].type in WHITESPACE_TOKENS:
    tree.children.append(tokens.pop(0))

  # If the first non-whitespace token is a cmake-format tag annotating
  # sortability, then parse it out here and record the annotation
  if tokens and get_tag(tokens[0]) in ("sortable", "sort"):
    tree.sortable = True
  elif tokens and get_tag(tokens[0]) in ("unsortable", "unsort"):
    tree.sortable = False

  while tokens:
    # Break if we have consumed enough positional arguments
    if pargs_are_full(npargs, nconsumed):
      break

    # Break if the next token belongs to a parent parser, i.e. if it
    # matches a keyword argument of something higher in the stack, or if
    # it closes a parent group.
    if should_break(tokens[0], breakstack):
      # NOTE(josh): if npargs is an exact number of arguments, then we
      # shouldn't break on kwarg match from a parent parser. Instead, we
      # should consume the token. This is a hack to deal with
      # ```install(RUNTIME COMPONENT runtime)``. In this case the second
      # occurance of "runtime" should not match the ``RUNTIME`` keyword
      # and should not break the positional parser.
      # TODO(josh): this is kind of hacky because it will force the positional
      # parser to consume a right parenthesis and will lead to parse errors
      # in the event of a missing positional argument. Such errors will be
      # difficult to debug for the user.
      if not npargs_is_exact(npargs):
        break

      # Even with an exact count, never swallow the closing paren.
      if tokens[0].type == lexer.TokenType.RIGHT_PAREN:
        break

    # If this is the start of a parenthetical group, then parse the group
    # NOTE(josh): syntatically this probably shouldn't be allowed here, but
    # cmake seems to accept it so we probably should too.
    if tokens[0].type == lexer.TokenType.LEFT_PAREN:
      subtree = ParenGroupNode.parse(ctx, tokens, breakstack)
      tree.children.append(subtree)
      continue

    # If it is a whitespace token then put it directly in the parse tree at
    # the current depth
    if tokens[0].type in WHITESPACE_TOKENS:
      tree.children.append(tokens.pop(0))
      continue

    # If it's a comment token not associated with an argument, then put it
    # directly into the parse tree at the current depth
    if tokens[0].type in (lexer.TokenType.COMMENT,
                          lexer.TokenType.BRACKET_COMMENT):
      before = len(tokens)
      child = CommentNode.consume(ctx, tokens)
      assert len(tokens) < before, \
          "consume_comment didn't consume any tokens"
      tree.children.append(child)
      continue

    # Otherwise it is a positional argument, so add it to the tree as such
    if get_normalized_kwarg(tokens[0]) in flags:
      child = TreeNode(NodeType.FLAG)
    else:
      child = TreeNode(NodeType.ARGUMENT)

    child.children.append(tokens.pop(0))
    # A comment on the same line attaches to the argument, not the group
    CommentNode.consume_trailing(ctx, tokens, child)
    tree.children.append(child)
    nconsumed += 1
  return tree
def parse2(cls, ctx, tokens, cmdspec, kwargs, breakstack):
  """
  Standard parser for the commands in the form of::

      command_name(parg1 parg2 parg3...
                   KEYWORD1 kwarg1 kwarg2...
                   KEYWORD2 kwarg3 kwarg4...
                   FLAG1 FLAG2 FLAG3)

  The parser starts off as a positional parser. If a keyword or flag is
  encountered the positional parser is popped off the parse stack. If it was
  a keyword then the keyword parser is pushed on the parse stack. If it was
  a flag than a new flag parser is pushed onto the stack.

  Returns the populated tree node; consumed tokens are removed from
  `tokens` in place. Raises InternalError if a subparser fails to consume
  any tokens (which would otherwise loop forever).
  """
  # NOTE(josh): we will pop things off this list, so let's make a copy
  pargspecs = list(cmdspec.pargs)

  tree = cls()
  tree.cmdspec = cmdspec

  # If it is a whitespace token then put it directly in the parse tree at
  # the current depth
  while tokens and tokens[0].type in WHITESPACE_TOKENS:
    tree.children.append(tokens.pop(0))

  # NOTE(josh): if there is only one non-exact legacy specification then we
  # reuse that specification for any additional positional arguments that we
  # pick up. This is to maintain the current/legacy behavior of simple
  # positional argument specifications
  # TODO(josh): double check the reasoning for this. I think it might be
  # mistaken and unnecessary
  default_spec = DEFAULT_PSPEC
  if (len(pargspecs) == 1 and pargspecs[0].legacy
      and not npargs_is_exact(pargspecs[0].nargs)):
    default_spec = pargspecs.pop(0)

  # Flags from every positional spec act as break-words for keyword groups
  all_flags = list(default_spec.flags)
  for pspec in pargspecs:
    all_flags.extend(pspec.flags)

  kwarg_breakstack = breakstack + [
      KwargBreaker(list(kwargs.keys()) + all_flags)
  ]

  while tokens:
    # If it is a whitespace token then put it directly in the parse tree at
    # the current depth
    if tokens[0].type in WHITESPACE_TOKENS:
      tree.children.append(tokens.pop(0))
      continue

    # If it's a comment, then add it at the current depth
    if tokens[0].type in (lexer.TokenType.COMMENT,
                          lexer.TokenType.BRACKET_COMMENT):
      if comment_belongs_up_tree(ctx, tokens, tree, breakstack):
        break
      tree.children.append(CommentNode.consume(ctx, tokens))
      continue

    # If it's a sentinel comment, then add it at the current depth
    if tokens[0].type in (lexer.TokenType.FORMAT_OFF,
                          lexer.TokenType.FORMAT_ON):
      tree.children.append(OnOffNode.consume(ctx, tokens))
      continue

    # Break if the next token belongs to a parent parser, i.e. if it
    # matches a keyword argument of something higher in the stack, or if
    # it closes a parent group.
    if should_break(tokens[0], breakstack):
      # NOTE(josh): if spec.nargs is an exact number of arguments, then we
      # shouldn't break on kwarg match from a parent parser. Instead, we
      # should consume that many tokens. This is a hack to deal with
      # ```install(RUNTIME COMPONENT runtime)``. In this case the second
      # occurance of "runtime" should not match the ``RUNTIME`` keyword
      # and should not break the positional parser.
      # TODO(josh): this is kind of hacky because it will force the positional
      # parser to consume a right parenthesis and will lead to parse errors
      # in the event of a missing positional argument. Such errors will be
      # difficult to debug for the user.
      if pargspecs:
        pspec = pargspecs[0]
      else:
        pspec = default_spec

      if not npargs_is_exact(pspec.nargs) or pspec.nargs == 0:
        break

    ntokens = len(tokens)
    word = get_normalized_kwarg(tokens[0])
    if word in kwargs:
      with ctx.pusharg(tree):
        subtree = KeywordGroupNode.parse(ctx, tokens, word, kwargs[word],
                                         kwarg_breakstack)
      tree.kwarg_groups.append(subtree)
    else:
      if pargspecs:
        pspec = pargspecs.pop(0)
      else:
        pspec = default_spec

      # Flags belonging to *other* positional specs break this group, but
      # a flag shared with the current spec must not
      other_flags = []
      for otherspec in pargspecs:
        for flag in otherspec.flags:
          if flag in pspec.flags:
            continue
          other_flags.append(flag)

      positional_breakstack = breakstack + [
          KwargBreaker(list(kwargs.keys()) + other_flags)
      ]
      with ctx.pusharg(tree):
        subtree = PositionalGroupNode.parse2(
            ctx, tokens, pspec, positional_breakstack)
      tree.parg_groups.append(subtree)

    # Guard against an infinite loop: every iteration must consume at least
    # one token
    if len(tokens) >= ntokens:
      raise InternalError(
          "parsed an empty subtree at {}:\n {}\n pspec: {}".format(
              tokens[0], dump_tree_tostr([tree]), pspec))
    tree.children.append(subtree)
  return tree
def parse2(cls, ctx, tokens, pargspecs, kwargs, breakstack):
  """
  Standard parser for the commands in the form of::

      command_name(parg1 parg2 parg3...
                   KEYWORD1 kwarg1 kwarg2...
                   KEYWORD2 kwarg3 kwarg4...
                   FLAG1 FLAG2 FLAG3)

  The parser starts off as a positional parser. If a keyword or flag is
  encountered the positional parser is popped off the parse stack. If it was
  a keyword then the keyword parser is pushed on the parse stack. If it was
  a flag than a new flag parser is pushed onto the stack.

  Returns the populated tree node; consumed tokens are removed from
  `tokens` in place. Raises InternalError if a subparser fails to consume
  any tokens (which would otherwise loop forever).
  """
  # NOTE: we pop specs off this list as they are consumed, so copy it to
  # avoid mutating the caller's list (the cmdspec variant of this parser
  # makes the same copy for the same reason).
  pargspecs = list(pargspecs)

  tree = cls()

  # If it is a whitespace token then put it directly in the parse tree at
  # the current depth
  while tokens and tokens[0].type in WHITESPACE_TOKENS:
    tree.children.append(tokens.pop(0))

  # NOTE(josh): if there is only one legacy specification then we reuse that
  # specification for any additional positional arguments that we pick up.
  # This is to maintain the current/legacy behavior of simple positional
  # argument specifications
  default_spec = DEFAULT_PSPEC
  if len(pargspecs) == 1 and pargspecs[0].legacy:
    default_spec = pargspecs.pop(0)

  # Flags from every positional spec act as break-words for keyword groups
  all_flags = list(default_spec.flags)
  for pspec in pargspecs:
    all_flags.extend(pspec.flags)

  kwarg_breakstack = breakstack + [
      KwargBreaker(list(kwargs.keys()) + all_flags)]

  while tokens:
    # Break if the next token belongs to a parent parser, i.e. if it
    # matches a keyword argument of something higher in the stack, or if
    # it closes a parent group.
    if should_break(tokens[0], breakstack):
      break

    # If it is a whitespace token then put it directly in the parse tree at
    # the current depth
    if tokens[0].type in WHITESPACE_TOKENS:
      tree.children.append(tokens.pop(0))
      continue

    # If it's a comment, then add it at the current depth
    if tokens[0].type in (lexer.TokenType.COMMENT,
                          lexer.TokenType.BRACKET_COMMENT):
      if comment_belongs_up_tree(ctx, tokens, tree, breakstack):
        break
      tree.children.append(CommentNode.consume(ctx, tokens))
      continue

    # If it's a sentinel comment, then add it at the current depth
    if tokens[0].type in (lexer.TokenType.FORMAT_OFF,
                          lexer.TokenType.FORMAT_ON):
      tree.children.append(OnOffNode.consume(ctx, tokens))
      continue

    ntokens = len(tokens)
    word = get_normalized_kwarg(tokens[0])
    if word in kwargs:
      with ctx.pusharg(tree):
        subtree = KeywordGroupNode.parse(
            ctx, tokens, word, kwargs[word], kwarg_breakstack)
      tree.kwarg_groups.append(subtree)
    else:
      if pargspecs:
        pspec = pargspecs.pop(0)
      else:
        pspec = default_spec

      # Flags belonging to *other* positional specs break this group, but
      # a flag shared with the current spec must not
      other_flags = []
      for otherspec in pargspecs:
        for flag in otherspec.flags:
          if flag in pspec.flags:
            continue
          other_flags.append(flag)

      positional_breakstack = breakstack + [
          KwargBreaker(list(kwargs.keys()) + other_flags)]

      with ctx.pusharg(tree):
        subtree = PositionalGroupNode.parse(
            ctx, tokens, pspec.nargs, pspec.flags, positional_breakstack)
      subtree.tags.extend(pspec.tags)
      tree.parg_groups.append(subtree)

    # Guard against an infinite loop: every iteration must consume at least
    # one token. Raise (not assert) so the check survives ``python -O``.
    if len(tokens) >= ntokens:
      raise InternalError("parsed an empty subtree")
    tree.children.append(subtree)
  return tree