def parse_macro(self):
    """Parse a macro in the form [name](arguments) and return its node."""
    self.get_token(TokenTypes.LITERAL, "[")
    macro_name = self.get_token_value(TokenTypes.TEXT)
    self.get_token(TokenTypes.LITERAL, "]")
    self.get_token(TokenTypes.LITERAL, "(")

    # Footnote arguments are collected without interpretation
    raw = macro_name == "footnote"

    arguments = self.collect_join(
        stop_tokens=[Token(TokenTypes.LITERAL, ")"), Token(TokenTypes.EOL)],
    )
    p = analyse(ArgumentsParser(raw=raw), arguments)

    self.get_token(TokenTypes.LITERAL, ")")

    # Macros with a dedicated parser are dispatched through this table;
    # everything else becomes a generic MacroNode.
    special_macros = {
        "link": self.parse_macro_link,
        "mailto": self.parse_macro_mailto,
        "image": self.parse_macro_image,
        "footnote": self.parse_macro_footnote,
    }

    parse_function = special_macros.get(macro_name)
    if parse_function is not None:
        return parse_function(args=p.args, kwargs=p.kwargs)

    return MacroNode(macro_name, args=p.args, kwargs=p.kwargs)
def parse_sentence(self, stop_tokens=None):
    """Parse a sequence of styled-text nodes into a SentenceNode.

    Consecutive WordNode results are merged into single TextNode
    instances so the resulting sentence contains the minimum
    number of nodes.
    """
    import itertools

    content = []
    stop_tokens = stop_tokens or set()

    # A sentence always stops at the end of the line or of the file
    stop_tokens = stop_tokens.union(
        {Token(TokenTypes.EOF), Token(TokenTypes.EOL)})

    result = self.parse_styled_text(stop_tokens)
    while result is not None:
        content.append(result)
        result = self.parse_styled_text(stop_tokens)

    # Group consecutive WordNode nodes into a single TextNode.
    # isinstance is the idiomatic type check (was: x.__class__ == WordNode).
    grouped_nodes = []
    for is_word, group in itertools.groupby(
            content, lambda x: isinstance(x, WordNode)):
        if is_word:
            text = "".join(n.value for n in group)
            grouped_nodes.append(TextNode(text))
        else:
            grouped_nodes.extend(group)

    return SentenceNode(content=grouped_nodes)
def test_token_equality_ignores_position():
    # Tokens with the same type and value compare equal no matter
    # which position (if any) either of them carries.
    with_position = Token("sometype", "somevalue", position=(12, 34))
    without_position = Token("sometype", "somevalue")

    assert with_position == without_position
    assert without_position == with_position
def _parse_list_nodes(self):
    # This parses all the items of a list and returns them as
    # a list of ListItemNode objects.

    # Ignore initial white spaces
    with self:
        self.get_token(TokenTypes.WHITESPACE)

    # Parse the header and ignore the following white spaces
    header = self.get_token(
        TokenTypes.LITERAL, check=lambda x: x[0] in "*#").value
    self.get_token(TokenTypes.WHITESPACE)

    # Collect and parse the text of the item
    text = self._collect_text_content()
    content = self._parse_text_content(text)

    # Compute the level of the item
    level = len(header)

    nodes = [ListItemNode(level, content)]

    # Idiomatic "not in" (was: "not self.peek_token() in [...]")
    while self.peek_token() not in [
            Token(TokenTypes.EOF), Token(TokenTypes.EOL)]:
        # This is the SentenceNode inside the last node added to the list
        # which is used to append potential nested nodes
        last_node_sentence = nodes[-1].content

        # Ignore the initial white spaces
        with self:
            self.get_token(TokenTypes.WHITESPACE)

        if len(self.peek_token().value) == level:
            # The new item is on the same level

            # Get the header
            header = self.get_token().value

            # Ignore white spaces
            self.get_token(TokenTypes.WHITESPACE)

            # Collect and parse the text of the item
            text = self._collect_text_content()
            content = self._parse_text_content(text)

            nodes.append(ListItemNode(len(header), content))
        elif len(self.peek_token().value) > level:
            # The new item is on a deeper level
            # Treat the new line as a new list
            numbered = self.peek_token().value[0] == "#"
            subnodes = self._parse_list_nodes()

            last_node_sentence.content.append(ListNode(numbered, subnodes))
        else:
            break

    return nodes
def parse_verbatim(self):
    """Parse text surrounded by backticks and return a VerbatimNode."""
    self.get_token(TokenTypes.LITERAL, "`")

    # Verbatim text ends at the closing backtick or at the end of the line
    stop = [Token(TokenTypes.LITERAL, "`"), Token(TokenTypes.EOL)]
    content = self.collect_join(stop)

    self.get_token(TokenTypes.LITERAL, "`")

    return VerbatimNode(content)
def _parse_paragraph(self):
    # A paragraph is everything up to an empty line (or the end of
    # the file); its lines are joined with single spaces and parsed
    # as one sentence.
    lines = self.collect_lines(
        [Token(TokenTypes.EOL), Token(TokenTypes.EOF)])
    sentence = self._parse_text_content(" ".join(lines))

    # Consume the attributes collected before the paragraph
    args, kwargs = self._pop_attributes()

    self._save(ParagraphNode(sentence, args=args, kwargs=kwargs))
def _parse_multi_line_comment(self):
    # Parse (and discard) a comment spanning multiple lines:
    #
    # ////
    # A comment
    # on multiple lines
    # ////
    self.get_token(TokenTypes.LITERAL, "////")

    # The collected lines are intentionally thrown away
    self._collect_lines(
        [Token(TokenTypes.LITERAL, "////"), Token(TokenTypes.EOF)])

    # The closing delimiter is mandatory
    self.force_token(TokenTypes.LITERAL, "////")
def _parse_verbatim(self):
    # Collect the text between two backticks (keeping escaped stop
    # tokens) and store it again surrounded by backticks.
    self.get_token(TokenTypes.LITERAL, "`")

    text = self.collect_join(
        [Token(TokenTypes.LITERAL, "`"), Token(TokenTypes.EOF)],
        preserve_escaped_stop_tokens=True,
    )

    self.get_token(TokenTypes.LITERAL, "`")

    self._save(TextNode(f"`{text}`"))
def parse_class(self):
    """Parse [class1,class2]#content# and return a ClassNode."""
    # Collect the comma-separated class names between square brackets
    self.get_token(TokenTypes.LITERAL, "[")
    classes = self.collect_join(
        [Token(TokenTypes.LITERAL, "]"), Token(TokenTypes.EOL)]
    )
    self.get_token(TokenTypes.LITERAL, "]")

    # The classified content is surrounded by hash characters
    self.get_token(TokenTypes.LITERAL, "#")
    content = self.parse_sentence(
        stop_tokens={Token(TokenTypes.LITERAL, "#")})
    self.get_token(TokenTypes.LITERAL, "#")

    return ClassNode(classes.split(","), content)
def _parse_unnamed_argument(self):
    # This parses an unnamed argument in the form value or "value".
    # Values can be surrounded by quotes.
    if self.peek_token_is(TokenTypes.LITERAL, '"'):
        # Quoted value: read everything up to the closing quote
        self.get_token(TokenTypes.LITERAL, '"')
        value = self.collect_join([Token(TokenTypes.LITERAL, '"')])
        self.get_token(TokenTypes.LITERAL, '"')
    else:
        # Plain value: read up to the next comma or the end of input
        value = self.collect_join(
            [Token(TokenTypes.LITERAL, ","), Token(TokenTypes.EOF)])

    return value
def _parse_block(self):
    # Parse a block delimited by 4 consecutive identical characters,
    # dispatching well-known block types to their dedicated parsers.
    delimiter = self.get_token(TokenTypes.TEXT).value
    if len(delimiter) != 4 or len(set(delimiter)) != 1:
        raise TokenError

    self.get_token(TokenTypes.EOL)

    # Primary content runs until the closing delimiter
    content = self.collect_lines(
        [Token(TokenTypes.TEXT, delimiter), Token(TokenTypes.EOF)])
    self.force_token(TokenTypes.TEXT, delimiter)
    self.get_token(TokenTypes.EOL)

    # Optional secondary content runs until the first empty line
    secondary_content = self.collect_lines(
        [Token(TokenTypes.EOL), Token(TokenTypes.EOF)])

    args, kwargs = self._pop_attributes()
    title = self._pop_title()

    # The first unnamed argument selects a special block parser
    if len(args) != 0:
        first = args[0]

        if first in ["if", "ifnot"]:
            return self._parse_conditional_block(
                first, content, args[1:], kwargs)
        if first in ["raw"]:
            return self._parse_raw_block(content, args[1:], kwargs)
        if first == "source":
            return self._parse_source_block(
                content, secondary_content, title, args[1:], kwargs)
        if first == "admonition":
            return self._parse_admonition_block(content, args[1:], kwargs)
        if first == "quote":
            return self._parse_quote_block(content, title, args[1:], kwargs)

    # Otherwise the first unnamed argument (when present) is the block type
    try:
        blocktype = args[0]
        args = args[1:]
    except IndexError:
        blocktype = None

    return self._parse_standard_block(
        blocktype, content, secondary_content, title, args, kwargs)
def _parse_paragraph(self):
    # This parses a paragraph. Paragraphs can be written on
    # multiple lines and end with an empty line.
    lines = self._collect_lines(
        [Token(TokenTypes.EOL), Token(TokenTypes.EOF)])

    # Join all the lines and parse them as a single sentence
    sentence = self._parse_text_content(" ".join(lines))

    # Consume the attributes
    args, kwargs = self.argsparser.get_arguments_and_reset()

    self._save(ParagraphNode(sentence, args=args, kwargs=kwargs))
def force_token(self, ttype, tvalue=None):
    # Like get_token, but a type/value mismatch is reported as an
    # ExpectedError carrying both the expected and the found token.
    try:
        return self.get_token(ttype, tvalue)
    except TokenError:
        raise ExpectedError(
            {
                "expected": Token(ttype, tvalue),
                "found": self.current_token,
            }
        )
def test_token_accepts_text_position():
    # The position is stored as a (line, column) tuple alongside
    # the type and the value.
    position = (456, 123)

    t = Token("sometype", "somevalue", position=position)

    assert t.type == "sometype"
    assert t.value == "somevalue"
    assert t.position == position
def parse_style(self):
    """Parse text delimited by a style character (* or _) into a StyleNode."""
    style = self.get_token_value(
        TokenTypes.LITERAL, check=lambda x: x in "*_")

    # The same character that opened the style also closes it
    content = self.parse_sentence(
        stop_tokens={Token(TokenTypes.LITERAL, style)})
    self.get_token(TokenTypes.LITERAL, style)

    return StyleNode(MAP_STYLES[style], content)
def _parse_unnamed_argument(self):
    # Parse an unnamed argument: either a plain text token or a
    # value surrounded by double quotes.
    if not self.peek_token_is(TokenTypes.LITERAL, '"'):
        return self.get_token(TokenTypes.TEXT).value

    # Quoted value: read everything up to the closing quote
    self.get_token(TokenTypes.LITERAL, '"')
    value = self.collect_join([Token(TokenTypes.LITERAL, '"')])
    self.get_token(TokenTypes.LITERAL, '"')

    return value
def test_check_current_token_with_function():
    p = init_parser("\n")
    p.get_token()

    # The check callable validates the current token's value
    # (an EOL token apparently carries None as its value)
    result = p.check_current_token(
        TokenTypes.EOL, check=lambda x: x is None)
    assert result == Token(TokenTypes.EOL)

    # check_current_token doesn't advance the index
    assert p.get_token() == EOL
    assert p.get_token() == EOF
def _parse_curly(self):
    # Parse a {name} or {namespace.name} placeholder and replace it
    # with the value of the corresponding variable.
    #
    # Raises PreprocessError if the variable has not been defined.
    self.get_token(TokenTypes.LITERAL, "{")
    variable_name = self.collect_join(
        [Token(TokenTypes.LITERAL, "}"), Token(TokenTypes.EOF)]
    )
    self.get_token(TokenTypes.LITERAL, "}")

    try:
        if "." not in variable_name:
            variable_value = self.variables[variable_name]
        else:
            # Only the first dot separates the namespace; further dots
            # are part of the variable name (matches the definition
            # parser, and avoids an uncaught ValueError on multi-dot
            # names that the old plain split(".") produced)
            namespace, variable_name = variable_name.split(".", maxsplit=1)
            variable_value = self.variables[namespace][variable_name]
    except KeyError:
        raise PreprocessError(f'Attribute "{variable_name}" has not been defined')

    # Outside the try so an unrelated KeyError in _save/TextNode
    # can't be mistaken for an undefined variable
    self._save(TextNode(variable_value))
def _parse_single_argument(self):
    # Parse a single argument, which can be either named
    # (name=value) or unnamed (value).
    #
    # In raw mode the whole rest of the line is stored verbatim
    # as a single unnamed argument.
    if self.raw:
        value = self.collect_join([Token(TokenTypes.EOL), Token(TokenTypes.EOF)])
        self.args.append(value)

        return

    # Try a named argument first. NOTE(review): this relies on the
    # `with self` context manager (defined elsewhere) suppressing a
    # parse failure so execution falls through — confirm against the
    # parser's __exit__.
    with self:
        name, value = self._parse_named_argument()
        self.kwargs[name] = value

        # Once a named argument has been seen, unnamed ones
        # are no longer allowed
        self._named_arguments = True

        return

    if self._named_arguments:
        raise ParseError("Unnamed arguments after named arguments are forbidden")

    # Fall back to an unnamed argument
    with self:
        value = self._parse_unnamed_argument()
        self.args.append(value)

        return
def collect_lines(self, stop_tokens):
    # Collect whole lines of text into a list until reaching a
    # line that begins with one of the stop tokens.
    lines = []

    while self.peek_token() not in stop_tokens:
        lines.append(self.collect_join([Token(TokenTypes.EOL)]))
        self.get_token(TokenTypes.EOL)

    return lines
def peek_token(self, ttype=None, tvalue=None, check=None):
    """
    Return the next token without advancing the index.

    The token can optionally be validated by type, value,
    or a check function (see _check_token). Past the end of
    the stream an EOF token is returned.
    """
    try:
        upcoming = self.tokens[self.index + 1]
        return self._check_token(upcoming, ttype, tvalue, check)
    except IndexError:
        return Token(TokenTypes.EOF)
def get_token(self, ttype=None, tvalue=None, check=None):
    """
    Return the next token and advance the index.

    After the call the token is available through current_token.
    Past the end of the stream an EOF token is returned.
    """
    if self.index == len(self.tokens):
        return Token(TokenTypes.EOF)

    self.index += 1

    return self._check_token(self.current_token, ttype, tvalue, check)
def get_token(self, ttype=None, tvalue=None, check=None):
    """
    Return the next token and advance the index.

    The token can optionally be checked against a type, a value,
    or a check function (see _check_token). After the call the
    token is available through current_token. Past the end of
    the stream an EOF token is returned.
    """
    if self.index == len(self.tokens):
        return Token(TokenTypes.EOF)

    self.index += 1

    return self._check_token(self.current_token, ttype, tvalue, check)
def _parse_list_nodes(self):
    # Parse all the items of a list and return them as a list
    # of ListItemNode objects.

    # Ignore initial white spaces
    with self:
        self.get_token(TokenTypes.WHITESPACE)

    # The header ("*"/"#" characters) encodes type and nesting level
    header = self.get_token(
        TokenTypes.LITERAL, check=lambda x: x[0] in "*#").value
    self.get_token(TokenTypes.WHITESPACE)

    text = self._collect_text_content()
    content = self._parse_text_content(text)

    # The level of an item is the length of its header
    level = len(header)

    nodes = [ListItemNode(level, content)]

    # Idiomatic "not in" (was: "not self.peek_token() in [...]")
    while self.peek_token() not in [
            Token(TokenTypes.EOF), Token(TokenTypes.EOL)]:
        # This is the SentenceNode inside the last node added to the
        # list, used to append potential nested lists
        last_node_sentence = nodes[-1].content

        # Ignore initial white spaces
        with self:
            self.get_token(TokenTypes.WHITESPACE)

        if len(self.peek_token().value) == level:
            # Same level: append a new item
            header = self.get_token().value
            self.get_token(TokenTypes.WHITESPACE)

            text = self._collect_text_content()
            content = self._parse_text_content(text)

            nodes.append(ListItemNode(len(header), content))
        elif len(self.peek_token().value) > level:
            # Deeper level: parse the rest as a nested list
            numbered = self.peek_token().value[0] == "#"
            subnodes = self._parse_list_nodes()

            last_node_sentence.content.append(ListNode(numbered, subnodes))
        else:
            break

    return nodes
def _collect_lines(self, stop_tokens):
    # Collect whole lines of text into a list until reaching a line
    # that begins with one of the stop tokens. Useful for blocks and
    # other elements that are clearly surrounded by delimiters.
    lines = []

    while self.peek_token() not in stop_tokens:
        lines.append(self.collect_join([Token(TokenTypes.EOL)]))
        self.get_token(TokenTypes.EOL)

    return lines
def force_token(self, ttype, tvalue=None):
    """
    Return the next token and advance the index, requiring the
    token to have a specific type and optionally a value.

    Raises ExpectedError (carrying the expected and the found
    token) when the next token doesn't match.
    """
    try:
        return self.get_token(ttype, tvalue)
    except TokenError:
        raise ExpectedError({
            "expected": Token(ttype, tvalue),
            "found": self.current_token,
        })
def current_token(self):
    """
    Return the token currently being parsed.

    We often need the token under the cursor, but all tokens may
    already have been consumed, so this convenience method wraps
    the possible index error and returns EOF instead.
    """
    if self.index < 0:
        raise ValueError("The parser has no current token")

    try:
        return self.tokens[self.index]
    except IndexError:
        return Token(TokenTypes.EOF)
def _parse_variable_definition(self):
    # Parse a variable definition.
    #
    # Simple variables are defined as :name:value,
    # True booleans as just :name: and False booleans as :!name:.
    # Variable names can use a namespace with :namespace.name:value

    # Get the mandatory variable name
    self.get_token(TokenTypes.LITERAL, ":")
    variable_name = self.get_token(TokenTypes.TEXT).value
    self.get_token(TokenTypes.LITERAL, ":")

    # A variable is a True flag unless its name starts with "!",
    # which marks a False flag
    variable_value = not variable_name.startswith("!")
    if not variable_value:
        variable_name = variable_name[1:]

    # Get the optional value
    value = self.collect_join([Token(TokenTypes.EOL)])

    # The value is assigned only if the variable is not a
    # negative flag. In that case it is ignored
    if variable_value and len(value) > 0:
        variable_value = value

    if "." not in variable_name:
        self.variables[variable_name] = variable_value
    else:
        # A dot in the name introduces a namespace; any further
        # dots belong to the variable name itself
        namespace, variable_name = variable_name.split(".", maxsplit=1)

        # This defines the namespace if it's not already there
        try:
            self.variables[namespace][variable_name] = variable_value
        except KeyError:
            self.variables[namespace] = {variable_name: variable_value}
def _parse_variable_definition(self):
    # Parse a variable definition in the form :name:value,
    # :name: (True flag), or :!name: (False flag).
    # Variable names can use a namespace with :namespace.name:value
    self.get_token(TokenTypes.LITERAL, ":")
    variable_name = self.get_token(TokenTypes.TEXT).value
    self.get_token(TokenTypes.LITERAL, ":")

    # Assume the variable is a True flag; a leading "!" makes it
    # a False flag
    variable_value = True
    if variable_name.startswith("!"):
        variable_value = False
        variable_name = variable_name[1:]

    value = self.collect_join([Token(TokenTypes.EOL)])

    # Assign the value only when the variable is not a negative
    # flag: previously a value after :!name: silently overrode
    # the "!" and set the variable to the string value
    if variable_value and len(value) > 0:
        variable_value = value

    if "." not in variable_name:
        self.variables[variable_name] = variable_value
    else:
        # Only the first dot separates the namespace; a plain
        # split(".") crashed on names containing several dots
        namespace, variable_name = variable_name.split(".", maxsplit=1)

        # This defines the namespace if it's not already there
        try:
            self.variables[namespace][variable_name] = variable_value
        except KeyError:
            self.variables[namespace] = {variable_name: variable_value}
def _parse_block(self):
    # Parse a block in the form
    #
    # [block_type]
    # ----
    # Content
    # ----
    # Optional secondary content
    #
    # Blocks are delimited by 4 consecutive identical characters.

    # Get the delimiter and check the length
    delimiter = self.get_token(TokenTypes.TEXT).value
    if len(delimiter) != 4 or len(set(delimiter)) != 1:
        raise TokenError

    self.get_token(TokenTypes.EOL)

    # Collect everything until the next delimiter
    content = self._collect_lines(
        [Token(TokenTypes.TEXT, delimiter), Token(TokenTypes.EOF)])
    self.force_token(TokenTypes.TEXT, delimiter)
    self.get_token(TokenTypes.EOL)

    # Get the optional secondary content (ends at the first empty line)
    secondary_content = self._collect_lines(
        [Token(TokenTypes.EOL), Token(TokenTypes.EOF)])

    # Consume the title
    title = self._pop_title()

    # The first unnamed argument is the block type
    blocktype = self.argsparser.pop()

    # If there is a block alias for blocktype replace it
    # otherwise use the blocktype we already have
    blocktype = self.block_aliases.get(blocktype, blocktype)

    # Assign names and defaults to the positional arguments
    self.argsparser.set_names_and_defaults(
        self.block_names.get(blocktype, []),
        self.block_defaults.get(blocktype, {}))

    # Consume the attributes
    args, kwargs = self.argsparser.get_arguments_and_reset()

    # Extract classes and convert them into a list
    classes = [
        i for i in kwargs.pop("classes", "").split(",") if len(i) > 0
    ]

    # Extract condition if present and process it
    condition = kwargs.pop("condition", "")

    # Run this only if there is a condition on this block
    if len(condition) > 0:
        try:
            # The condition should be either test:variable:value or test:variable:
            test, variable, value = condition.split(":")
        except ValueError:
            # NOTE(review): if self.error returns instead of raising,
            # test/variable/value are unbound below — presumably it
            # raises; confirm.
            self.error(
                f'Condition {condition} is not in the form "test:variable:value" or "test:variable:'
            )

        # If there is no value use True (so ":variable:" tests a flag)
        if len(value) == 0:
            value = True

        # Check if the variable matches the value and apply the requested test
        match = self.variables.get(variable) == value
        result = True if test == "if" else False

        # If the condition is not satisfied return
        if match is not result:
            return

    # Extract the preprocessor
    preprocessor = kwargs.pop("preprocessor", "none")

    # Extract the engine
    engine = kwargs.pop("engine", "default")

    # Create the node parameters according to the engine
    if engine in ["raw", "mau"]:
        # Engine "raw" doesn't process the content,
        # so we just pass it untouched in the form of
        # a TextNode per line. The same is true for "mau"
        # as the visitor will have to fire up an new parser
        # to process the content.
        content = [TextNode(line) for line in content]
        secondary_content = [TextNode(line) for line in secondary_content]
    elif engine == "source":
        # Engine "source" extracts the content (source code),
        # the callouts, and the highlights.
        # The default language is "text".
        content, callouts, highlights = self._parse_source_engine(
            content, secondary_content, kwargs)
        secondary_content = []
        kwargs["callouts"] = callouts
        kwargs["highlights"] = highlights
        kwargs["language"] = kwargs.get("language", "text")
    elif engine == "default":
        # This is the default engine and it parses
        # both content and secondary content using a new parser
        # but then merges headers and footnotes into the
        # current one.

        # Parse the primary and secondary content and record footnotes
        pc = MainParser(variables=self.variables).analyse(
            "\n".join(content))
        ps = MainParser(variables=self.variables).analyse(
            "\n".join(secondary_content))

        content = pc.nodes
        secondary_content = ps.nodes

        self.footnote_defs.extend(pc.footnote_defs)
        self.headers.extend(pc.headers)
    else:
        raise EngineError(f"Engine {engine} is not available")

    self._save(
        BlockNode(
            blocktype=blocktype,
            content=content,
            secondary_content=secondary_content,
            args=args,
            classes=classes,
            engine=engine,
            preprocessor=preprocessor,
            kwargs=kwargs,
            title=title,
        ))