def parse_markdown(
    self, text: str, parent: Optional[nodes.Node] = None
) -> List[nodes.Node]:
    """Parse text as CommonMark, in a new document."""
    parser = default_parser(MdParserConfig(commonmark_only=True))

    # setup parent node
    if parent is None:
        parent = nodes.container()
        self.add_source_and_line(parent)
    parser.options["current_node"] = parent

    # setup containing document
    new_doc = make_document(self.node.source)
    new_doc.settings = self.document.settings
    new_doc.reporter = self.document.reporter
    parser.options["document"] = new_doc

    # use the node docname, where possible, to deal with single document builds
    with mock.patch.dict(
        self.env.temp_data, {"docname": self.env.path2doc(self.node.source)}
    ):
        parser.render(text)

    # TODO are there any transforms we should retroactively carry out?
    return parent.children

def parse(
    self,
    inputstring: str,
    document: nodes.document,
):
    """Parse source text.

    Args:
        inputstring: The source string to parse
        document: The root docutils node to add AST elements to
    """
    try:
        config = document.settings.env.myst_config
    except Exception:
        config = MdParserConfig(renderer="docutils")
    parser = default_parser(config)
    parser.options["document"] = document
    env = AttrDict()
    tokens = parser.parse(inputstring, env)
    if not tokens or tokens[0].type != "front_matter":
        # we always add front matter, so that we can merge it with global keys,
        # specified in the sphinx configuration
        tokens = [
            Token(
                type="front_matter",
                tag="",
                nesting=0,
                content="{}",  # noqa: P103
                map=[0, 0],
            ),
        ] + tokens
    parser.renderer.render(tokens, parser.options, env)

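# Hedged sketch of the same pattern driven standalone: render MyST text into a
# fresh docutils document. Assumes myst-parser <= 0.17, where `default_parser`
# and `MdParserConfig` live in `myst_parser.main`; `make_document` is the same
# helper imported from `myst_parser.docutils_renderer` elsewhere in these
# examples.
from myst_parser.docutils_renderer import make_document
from myst_parser.main import MdParserConfig, default_parser


def render_to_docutils(text: str):
    """Render MyST text into a new docutils document and return it."""
    parser = default_parser(MdParserConfig(renderer="docutils"))
    document = make_document()
    parser.options["document"] = document
    parser.render(text)  # populates `document` in place
    return document


# e.g. print(render_to_docutils("# Title\n\nSome *text*").pformat())
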
def print_anchors(args=None):
    """Parse a file and print the heading anchor HTML that would be generated."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "input",
        nargs="?",
        type=argparse.FileType("r"),
        default=sys.stdin,
        help="Input file (default stdin)",
    )
    parser.add_argument(
        "-o",
        "--output",
        type=argparse.FileType("w"),
        default=sys.stdout,
        help="Output file (default stdout)",
    )
    parser.add_argument(
        "-l", "--level", type=int, default=2, help="Maximum heading level."
    )
    args = parser.parse_args(args)

    parser = default_parser(MdParserConfig(renderer="html", heading_anchors=args.level))

    def _filter_plugin(state):
        state.tokens = [
            t
            for t in state.tokens
            if t.type.startswith("heading_") and int(t.tag[1]) <= args.level
        ]

    parser.use(lambda p: p.core.ruler.push("filter", _filter_plugin))
    text = parser.render(args.input.read())
    args.output.write(text)

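# Hedged usage sketch for `print_anchors`: it is argparse-driven, so it can be
# called with an argv list (or with no arguments, to read stdin and write
# stdout). The file name below is hypothetical; in myst-parser this function
# also backs the `myst-anchors` console script.
print_anchors(["page.md", "--level", "2"])
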
def matches_mystnb(
    text,
    ext=None,
    requires_meta=True,
    code_directive=CODE_DIRECTIVE,
    raw_directive=RAW_DIRECTIVE,
):
    """Attempt to distinguish a file as myst, only given its extension and content.

    :param ext: the extension of the file
    :param requires_meta: requires the file to contain top matter metadata
    :param code_directive: the name of the directive to search for containing code cells
    :param raw_directive: the name of the directive to search for containing raw cells
    """
    # is the extension uniquely associated with myst (i.e. not just .md)
    if ext and "." + ("." + ext).rsplit(".", 1)[1] in myst_extensions(no_md=True):
        return True

    # might the text contain metadata front matter
    if requires_meta and not text.startswith("---"):
        return False

    try:
        # parse the markdown file up to the block level (i.e. don't worry about inline text)
        parser = default_parser("html", disable_syntax=["inline"])
        tokens = parser.parse(text + "\n")
    except (TypeError, ValueError) as err:
        warnings.warn("myst-parse failed unexpectedly: {}".format(err))
        return False

    # Is the format information available in the jupytext text representation?
    if tokens and tokens[0].type == "front_matter":
        try:
            metadata = yaml.safe_load(tokens[0].content)
        except (yaml.parser.ParserError, yaml.scanner.ScannerError):
            pass
        else:
            try:
                if (
                    metadata.get("jupytext", {})
                    .get("text_representation", {})
                    .get("format_name", "")
                    == MYST_FORMAT_NAME
                ):
                    return True
            except AttributeError:
                pass

    # is there at least one fenced code block with a code/raw directive language
    for token in tokens:
        if token.type == "fence" and (
            token.info.startswith(code_directive)
            or token.info.startswith(raw_directive)
        ):
            return True

    return False

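# Hedged usage sketch for `matches_mystnb`: with jupytext-style front matter
# declaring the myst format, this is expected to return True (assuming
# MYST_FORMAT_NAME == "myst", as in jupytext's MyST support).
sample = "\n".join(
    [
        "---",
        "jupytext:",
        "  text_representation:",
        "    format_name: myst",
        "---",
        "",
        "Some markdown text.",
    ]
)
assert matches_mystnb(sample, ext=".md")
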
def parse_markdown(
    self, text: str, parent: Optional[nodes.Node] = None
) -> List[nodes.Node]:
    """Parse text as CommonMark, in a new document."""
    parser = default_parser(MdParserConfig(commonmark_only=True))
    parent = parent or nodes.container()
    parser.options["current_node"] = parent
    parser.render(text)
    # TODO are there any transforms we should retroactively carry out?
    return parent.children

def parse(self, inputstring: str, document: nodes.document) -> None:
    """Parse source text.

    :param inputstring: The source string to parse
    :param document: The root docutils node to add AST elements to
    """
    config = document.settings.env.myst_config
    parser = default_parser(config)
    parser.options["document"] = document
    env: dict = {}
    tokens = parser.parse(inputstring, env)
    if not tokens or tokens[0].type != "front_matter":
        # we always add front matter, so that we can merge it with global keys,
        # specified in the sphinx configuration
        tokens = [Token("front_matter", "", 0, content="{}", map=[0, 0])] + tokens

    header_text = None
    if tokens[0].type == 'front_matter':
        #
        # Hugo article migration:
        #
        # * get the title from the front matter (yaml), falling back to
        #   the file name (or parent directory name for index pages)
        #
        import pathlib

        path = pathlib.Path(document.current_source)
        title = path.stem
        if title in ('index', '_index'):
            title = path.parent.stem
        try:
            import yaml

            data = yaml.safe_load(tokens[0].content)
            title = data['title']
        except Exception:
            pass

        header_text = Token("text", "", 0, content=title, map=tokens[0].map)
        tokens = [
            tokens[0],
            Token("heading_open", "h1", 1, content="{}", map=header_text.map),
            Token(
                "inline",
                "",
                0,
                content="{}",
                map=header_text.map,
                children=[header_text],
            ),
            Token("heading_close", "h1", -1, content="{}", map=header_text.map),
        ] + tokens[1:]

    parser.renderer.render(tokens, parser.options, env)

def parse(
    self, inputstring: str, document: nodes.document, renderer: str = "sphinx"
):
    """Parse source text.

    :param inputstring: The source string to parse
    :param document: The root docutils node to add AST elements to
    """
    if renderer == "sphinx":
        config = document.settings.env.myst_config
    else:
        config = MdParserConfig()
    parser = default_parser(config)
    parser.options["document"] = document
    parser.render(inputstring)

def parse(self, inputstring: str, document: nodes.document) -> None:
    """Parse source text.

    :param inputstring: The source string to parse
    :param document: The root docutils node to add AST elements to
    """
    config = MdParserConfig(renderer="docutils", enable_extensions=["linkify"])
    parser = default_parser(config)
    parser.options["document"] = document
    env = AttrDict()
    tokens = parser.parse(inputstring, env)
    if not tokens or tokens[0].type != "front_matter":
        # we always add front matter, so that we can merge it with global keys,
        # specified in the sphinx configuration
        tokens = [Token("front_matter", "", 0, content="{}", map=[0, 0])] + tokens
    parser.renderer.render(tokens, parser.options, env)

def to_sphinx(
    filename: Iterable[str],
    parser_config: Optional[MdParserConfig] = None,
    options=None,
    env=None,
    document=None,
    conf=None,
    srcdir=None,
    with_builder="singlehtml",
):
    """Build a file with a mock Sphinx environment and return the HTML body.

    :param filename: the file to build
    :param parser_config: the configuration to create the parser with
    :param options: options to update the parser with
    :param env: The sandbox environment for the parse
        (will contain e.g. reference definitions)
    :param document: the docutils root node to use
        (otherwise a new one will be created)
    :param conf: the sphinx conf.py as a dictionary
    :param srcdir: the source directory for the mock sphinx env
    :param with_builder: the sphinx builder to use
    :returns: the body of the rendered HTML
    """
    from myst_parser.docutils_renderer import make_document

    md = default_parser(parser_config or MdParserConfig())
    if options:
        md.options.update(options)
    md.options["document"] = document or make_document()
    force_all = False
    with mock_sphinx_env_compat(
        conf=conf,
        srcdir=srcdir,
        document=md.options["document"],
        with_builder=with_builder,
    ) as app:
        app.build(force_all, (filename,))
        filehtml = Path(filename).with_suffix(".html").name
        output = (Path(app.outdir) / filehtml).read_text()
        return get_div_body(output)

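# Hedged usage sketch for `to_sphinx` (file name and conf values hypothetical):
# build a single file through the mocked Sphinx environment and inspect the
# resulting HTML body.
body = to_sphinx("index.md", conf={"extensions": ["myst_parser"]})
print(body)
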
def replace_admonition_in_cell_source(cell_str):
    """Return the cell source with each admonition replaced by its generated HTML."""
    config = MdParserConfig(renderer="docutils")
    parser = default_parser(config)
    tokens = parser.parse(cell_str)
    admonition_tokens = [
        t for t in tokens if t.type == "fence" and t.info in all_directive_names
    ]
    cell_lines = cell_str.splitlines()
    new_cell_str = cell_str
    for t in admonition_tokens:
        adm_begin, adm_end = t.map
        adm_src = "\n".join(cell_lines[adm_begin:adm_end])
        adm_doc = parser.render(adm_src)
        adm_html = admonition_html(adm_doc)
        new_cell_str = new_cell_str.replace(adm_src, adm_html)
    return new_cell_str

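# Hedged usage sketch for `replace_admonition_in_cell_source`:
# `all_directive_names` and `admonition_html` come from the surrounding module,
# so only the call shape is shown here.
cell = "\n".join(
    [
        "Some text.",
        "",
        "```{note}",
        "An admonition body.",
        "```",
    ]
)
new_cell = replace_admonition_in_cell_source(cell)
# the `{note}` fence is replaced in place by its rendered HTML
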
def myst_to_notebook(
    text,
    code_directive=CODE_DIRECTIVE,
    raw_directive=RAW_DIRECTIVE,
    add_source_map=False,
):
    """Convert text written in the myst format to a notebook.

    :param text: the file text
    :param code_directive: the name of the directive to search for containing code cells
    :param raw_directive: the name of the directive to search for containing raw cells
    :param add_source_map: add a `source_map` key to the notebook metadata,
        which is a list of the starting source line number for each cell.

    :raises MystMetadataParsingError: if the metadata block is not valid JSON/YAML

    NOTE: we assume here that all of these directives are at the top-level,
    i.e. not nested in other directives.
    """
    # parse the markdown file up to the block level (i.e. don't worry about inline text)
    parser = default_parser("html", disable_syntax=["inline"])
    tokens = parser.parse(text + "\n")
    lines = text.splitlines()
    md_start_line = 0

    # get the document metadata
    metadata_nb = {}
    if tokens[0].type == "front_matter":
        metadata = tokens.pop(0)
        md_start_line = metadata.map[1]
        try:
            metadata_nb = yaml.safe_load(metadata.content)
        except (yaml.parser.ParserError, yaml.scanner.ScannerError) as error:
            raise MystMetadataParsingError("Notebook metadata: {}".format(error))

    # create an empty notebook
    nbf_version = nbf.v4
    kwargs = {"metadata": nbf.from_dict(metadata_nb)}
    notebook = nbf_version.new_notebook(**kwargs)
    source_map = []  # this is a list of the starting line number for each cell

    def _flush_markdown(start_line, token, md_metadata):
        """When we find a cell we check if there is preceding text."""
        endline = token.map[0] if token else len(lines)
        md_source = strip_blank_lines("\n".join(lines[start_line:endline]))
        meta = nbf.from_dict(md_metadata)
        if md_source:
            source_map.append(start_line)
            notebook.cells.append(
                nbf_version.new_markdown_cell(source=md_source, metadata=meta)
            )

    # iterate through the tokens to identify notebook cells
    nesting_level = 0
    md_metadata = {}

    for token in tokens:

        nesting_level += token.nesting

        if nesting_level != 0:
            # we ignore fenced blocks that are nested, e.g. as part of lists, etc
            continue

        if token.type == "fence" and token.info.startswith(code_directive):
            _flush_markdown(md_start_line, token, md_metadata)
            options, body_lines = read_fenced_cell(token, len(notebook.cells), "Code")
            meta = nbf.from_dict(options)
            source_map.append(token.map[0] + 1)
            notebook.cells.append(
                nbf_version.new_code_cell(source="\n".join(body_lines), metadata=meta)
            )
            md_metadata = {}
            md_start_line = token.map[1]

        elif token.type == "fence" and token.info.startswith(raw_directive):
            _flush_markdown(md_start_line, token, md_metadata)
            options, body_lines = read_fenced_cell(token, len(notebook.cells), "Raw")
            meta = nbf.from_dict(options)
            source_map.append(token.map[0] + 1)
            notebook.cells.append(
                nbf_version.new_raw_cell(source="\n".join(body_lines), metadata=meta)
            )
            md_metadata = {}
            md_start_line = token.map[1]

        elif token.type == "myst_block_break":
            _flush_markdown(md_start_line, token, md_metadata)
            md_metadata = read_cell_metadata(token, len(notebook.cells))
            md_start_line = token.map[1]

    _flush_markdown(md_start_line, None, md_metadata)

    if add_source_map:
        notebook.metadata["source_map"] = source_map

    return notebook

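# Hedged usage sketch for `myst_to_notebook`, assuming CODE_DIRECTIVE is the
# "{code-cell}" fence name used by the MyST notebook format.
text = "\n".join(
    [
        "---",
        "kernelspec:",
        "  name: python3",
        "---",
        "",
        "# A title",
        "",
        "```{code-cell}",
        "1 + 1",
        "```",
    ]
)
notebook = myst_to_notebook(text, add_source_map=True)
# notebook.cells -> a markdown cell ("# A title") then a code cell ("1 + 1")
# notebook.metadata["source_map"] -> the starting source line of each cell
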
def nb_to_tokens(
    ntbk: nbf.NotebookNode, config: MdParserConfig, renderer_plugin: str
) -> Tuple[MarkdownIt, AttrDict, List[Token]]:
    """Parse the notebook content to a list of syntax tokens and an env,
    containing global data like reference definitions.
    """
    md = default_parser(config)

    # setup the markdown parser
    # Note we disable front matter parsing,
    # because this is taken from the actual notebook metadata
    md.disable("front_matter", ignoreInvalid=True)
    md.renderer = SphinxNBRenderer(md)

    # make a sandbox where all the parsing global data,
    # like reference definitions will be stored
    env = AttrDict()
    rules = md.core.ruler.get_active_rules()

    # First only run pre-inline chains
    # so we can collect all reference definitions, etc, before assessing references
    def parse_block(src, start_line):
        with md.reset_rules():
            # enable only rules up to block
            md.core.ruler.enableOnly(rules[:rules.index("inline")])
            tokens = md.parse(src, env)
        for token in tokens:
            if token.map:
                token.map = [start_line + token.map[0], start_line + token.map[1]]
        for dup_ref in env.get("duplicate_refs", []):
            if "fixed" not in dup_ref:
                dup_ref["map"] = [
                    start_line + dup_ref["map"][0],
                    start_line + dup_ref["map"][1],
                ]
                dup_ref["fixed"] = True
        return tokens

    block_tokens = []
    source_map = ntbk.metadata.get("source_map", None)

    # get the language lexer name
    langinfo = ntbk.metadata.get("language_info", {})
    lexer = langinfo.get("pygments_lexer", langinfo.get("name", None))
    # TODO log a warning if the lexer is still None

    for cell_index, nb_cell in enumerate(ntbk.cells):

        # if the source_map has been stored (for text-based notebooks),
        # we use that to define the starting line for each cell,
        # otherwise, we set a pseudo base that represents the cell index
        start_line = source_map[cell_index] if source_map else (cell_index + 1) * 10000
        start_line += 1  # use base 1 rather than 0

        # skip empty cells
        if len(nb_cell["source"].strip()) == 0:
            continue

        # skip cells tagged for removal
        # TODO this logic should be deferred to a transform
        tags = nb_cell.metadata.get("tags", [])
        if ("remove_cell" in tags) or ("remove-cell" in tags):
            continue

        if nb_cell["cell_type"] == "markdown":

            # we add the cell index to tokens,
            # so they can be included in the error logging
            block_tokens.extend(parse_block(nb_cell["source"], start_line))

        elif nb_cell["cell_type"] == "code":

            # here we do nothing but store the cell as a custom token
            block_tokens.append(
                Token(
                    "nb_code_cell",
                    "",
                    0,
                    meta={
                        "cell": nb_cell,
                        "lexer": lexer,
                        "renderer": renderer_plugin,
                    },
                    map=[start_line, start_line],
                )
            )

    # Now that all definitions have been gathered,
    # we run the inline and post-inline chains, to expand the text.
    # Note we assume here that these rules never require the actual source text,
    # only acting on the existing tokens
    state = StateCore(None, md, env, block_tokens)
    with md.reset_rules():
        md.core.ruler.enableOnly(rules[rules.index("inline"):])
        md.core.process(state)

    # Add the front matter.
    # Note that myst_parser serialises dict/list like keys, when rendering to
    # docutils docinfo. These could be read back with `json.loads`.
    state.tokens = [
        Token("front_matter", "", 0, content=({k: v for k, v in ntbk.metadata.items()}))
    ] + state.tokens

    # If there are widgets, this will embed the state of all widgets in a script
    if contains_widgets(ntbk):
        state.tokens.append(
            Token("jupyter_widget_state", "", 0, meta={"state": get_widgets(ntbk)})
        )

    return md, env, state.tokens

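# Hedged usage sketch for `nb_to_tokens`: read a notebook with nbformat and
# tokenise it. The file name and the renderer plugin name ("default") are
# hypothetical.
import nbformat as nbf

ntbk = nbf.read("example.ipynb", as_version=4)
md, env, tokens = nb_to_tokens(ntbk, MdParserConfig(), "default")
assert tokens[0].type == "front_matter"  # built from the notebook metadata
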
def to_model(myst):
    md = default_parser("docutils")
    tokens = md.parse(myst)
    sections = _split_sections(tokens)
    return sections