def tokenize_main(
    lines: SourceLines,
    token_types=None,
    expand_spans: bool = True,
    skip_tokens: list = ("LinkDefinition", "Footnote"),
):
    """Searches for token_types in an iterable.

    :param lines: the source lines
    :param token_types: override block-level tokens set in global context
    :param expand_spans: After the initial parse the span text is not yet tokenized,
        but stored instead as raw text in `SpanContainer`, in order to ensure
        all link definitions are read first. Setting True, runs a second walk of the
        syntax tree to replace these `SpanContainer` with the final span tokens.
    :param skip_tokens: do not store these ``token.name`` in the syntax tree.
        These are usually tokens that store themselves in the global context

    :returns: list of block-level token instances.
    """
    if not isinstance(lines, SourceLines):
        lines = SourceLines(lines)
    if token_types is None:
        token_types = get_parse_context().block_tokens
    tokens = tokenize_block(lines, token_types=token_types, skip_tokens=skip_tokens)
    if expand_spans:
        # footnote definitions live only in the global context, so they must be
        # expanded in addition to the tokens of the returned tree
        for token in tokens + list(get_parse_context().foot_definitions.values()):
            token.expand_spans()
    return tokens
def test_repeated_footnote(caplog):
    """A duplicate footnote label logs a warning and only the first is kept."""
    get_parse_context().block_tokens.insert_before(
        block_tokens_ext.Footnote, block_tokens.LinkDefinition
    )
    source_lines = ["[^1]: value1\n", "[^1]: value2\n"]
    tokenize_main(source_lines)
    assert "ignoring duplicate footnote definition" in caplog.text
    assert len(get_parse_context().foot_definitions) == 1
def read(cls, lines: SourceLines):
    """Parse a YAML front-matter block, starting at the opening ``---`` line."""
    start_line = lines.lineno + 1
    next(lines)  # consume the opening ``---``
    content_lines = []
    next_line = lines.peek()
    while next_line is not None and not next_line.startswith("---"):
        content_lines.append(next(lines))
        next_line = lines.peek()
    closed = next_line is not None
    if closed:
        next(lines)  # consume the closing ``---``
    position = Position.from_source_lines(lines, start_line=start_line)
    if not closed:
        # the block ran to the end of the source without a terminator
        get_parse_context().logger.warning(
            "{} No closing `---` was found for initial metadata block".format(
                position.make_loc_str()
            )
        )
    return cls(content="".join(content_lines), position=position)
def test_foot_ref_span(name, source, data_regression):
    """Regression-check span tokenization of footnote references."""
    context = get_parse_context()
    context.foot_definitions["a"] = True
    context.span_tokens.insert_after(FootReference, CoreTokens)
    tokens = tokenize_span(source)
    data_regression.check(
        serialize_tokens(tokens, as_dict=True),
        basename="test_foot_ref_span_{}".format(name),
    )
def test_resolution(name, source, data_regression):
    """Regression-check full document parsing with footnote tokens enabled."""
    context = get_parse_context()
    context.span_tokens.insert_after(FootReference, CoreTokens)
    context.block_tokens.insert_before(
        block_tokens_ext.Footnote, block_tokens.LinkDefinition
    )
    document = block_tokens.Document.read(source)
    data_regression.check(
        serialize_tokens(document, as_dict=True),
        basename="test_resolution_{}".format(name),
    )
def test_foot_definition(name, source, data_regression):
    """Regression-check that footnote definitions are stored in the global context."""
    get_parse_context().block_tokens.insert_before(
        block_tokens_ext.Footnote, block_tokens.LinkDefinition
    )
    serialized_tree = serialize_tokens(tokenize_main(source), as_dict=True)
    serialized_footnotes = serialize_tokens(
        get_parse_context().foot_definitions, as_dict=True
    )
    data_regression.check(
        {
            "tree": serialized_tree,
            "footnotes": serialized_footnotes,
            "link_definitions": get_parse_context().link_definitions,
        },
        basename="test_foot_definitions_{}".format(name),
    )
def append_link_definitions(matches, position):
    """Store matched link definitions in the global context, warning on duplicates."""
    for key, dest, title in matches:
        key = normalize_label(key)
        link_definitions = get_parse_context().link_definitions
        if key in link_definitions:
            # first definition wins; subsequent ones are reported and dropped
            get_parse_context().logger.warning(
                "{} ignoring duplicate link definition '{}'".format(
                    position.make_loc_str(), key
                )
            )
            continue
        link_definitions[key] = (
            span_tokens.EscapeSequence.strip(dest.strip()),
            span_tokens.EscapeSequence.strip(title),
        )
def render_document(self, token: block_tokens.Document):
    """Render the document root, appending referenced footnotes at the end."""
    if token.front_matter:
        self.render_front_matter(token.front_matter)
    self.render_children(token)
    if getattr(token, "is_nested", False):
        # nested documents defer footnote output to the outermost document
        return self.document
    # we use the footnotes stored in the global context,
    # rather than those stored on the document,
    # since additional references may have been made in nested parses
    footnotes = get_parse_context().foot_definitions
    # we don't use the foot_references stored on the global context,
    # since references within directives/roles will have been added after
    # those from the initial markdown parse;
    # instead we gather them from a walk of the created document
    ordered_refs = OrderedDict()
    for refnode in self.document.traverse(nodes.footnote_reference):
        ordered_refs.setdefault(refnode["refname"], True)
    if ordered_refs:
        self.current_node.append(nodes.transition())
        for refname in ordered_refs:
            if refname in footnotes:
                self.render_footnote(footnotes[refname])
    return self.document
def inline_text(self, text: str, lineno: int):
    """Parse *text* as inline (span-level) markdown, via a nested document parse.

    :param text: the raw source text to parse
    :param lineno: NOTE(review): appears unused — ``self._lineno`` is used for
        the source mapping instead; confirm against callers
    :returns: tuple of ``(textnodes, messages)``
    """
    # TODO return messages?
    messages = []
    paragraph = nodes.paragraph("")
    # here we instantiate a new renderer,
    # so that the nested parse does not affect the current renderer,
    # but we use the same global parse context, so that link references, etc
    # are added to the global parse.
    renderer = self._renderer.__class__(
        document=self.document,
        current_node=paragraph,
        parse_context=get_parse_context(),
    )
    lines = SourceLines(
        text,
        start_line=self._lineno,
        uri=self.document["source"],
        metadata=self._token.position.data,
        standardize_ends=True,
    )
    # reset_definitions=False keeps link/footnote definitions gathered so far
    doc_token = myst_block_tokens.Document.read(
        lines, front_matter=False, reset_definitions=False
    )
    # we mark the token as nested so that footnotes etc aren't rendered
    doc_token.is_nested = True
    renderer.render(doc_token)
    textnodes = []
    if paragraph.children:
        # first child should be paragraph
        textnodes = paragraph.children[0].children
    return textnodes, messages
def match_link_label(string, offset):
    """Scan ``string`` from ``offset`` for a bracketed link label ``[...]``.

    :returns: ``((start, end, label), definition)`` if the label matches a
        stored link definition in the global context, otherwise ``None``.
    """
    start = -1
    end = -1
    escaped = False
    for i, c in enumerate(string[offset:], start=offset):
        if c == "\\" and not escaped:
            escaped = True
        elif c == "[" and not escaped:
            if start == -1:
                start = i
            else:
                # a second unescaped "[" before closing invalidates the label
                return None
        elif c == "]" and not escaped:
            end = i
            # NOTE(review): if "]" occurs before any "[", ``start`` is still -1
            # and the slice silently begins at index 0 — presumably callers only
            # invoke this where a "[" precedes; confirm.
            label = string[start + 1:end]
            match_info = start, end + 1, label
            if label.strip() != "":
                link_definitions = get_parse_context().link_definitions
                ref = link_definitions.get(normalize_label(label), None)
                if ref is not None:
                    return match_info, ref
                # the first "]" terminates the scan either way
                return None
            return None
        elif escaped:
            escaped = False
    return None
def test_link_definitions(name, source, data_regression):
    """Regression-check that link definitions are collected in the global context."""
    serialized_tree = serialize_tokens(tokenize_main(source), as_dict=True)
    data_regression.check(
        {
            "tree": serialized_tree,
            "link_definitions": get_parse_context().link_definitions,
        },
        basename="test_link_definitions_{}".format(name),
    )
def read(
    cls,
    lines: Union[str, ListType[str], SourceLines],
    reset_definitions: bool = True,
    skip_tokens: list = ("LinkDefinition", "Footnote"),
    front_matter: bool = False,
):
    """Read a document.

    :param lines: Lines to parse
    :param reset_definitions: remove any previously stored definitions
        in the global context (see ``ParseContext.reset_definitions()``).
    :param skip_tokens: do not store these ``token.name`` in the syntax tree.
        These are usually tokens that store themselves in the global context.
    :param front_matter: search for an initial YAML block front matter block
        (note this is not strictly CommonMark compliant)
    """
    if reset_definitions:
        get_parse_context().reset_definitions()
    if not isinstance(lines, SourceLines):
        lines = SourceLines(lines, standardize_ends=True)
    # TODO can we do this in a way where we are checking
    # FrontMatter in get_parse_context().block_tokens?
    # then it would be easier to add/remove it in the renderers
    front_matter_token = None
    first_line = lines.peek()
    if front_matter and first_line and first_line.startswith("---"):
        front_matter_token = FrontMatter.read(lines)
    children = tokenizer.tokenize_main(lines=lines, skip_tokens=skip_tokens)
    foot_defs = get_parse_context().foot_definitions
    # references are recorded in order; keep only those with a definition
    ref_order = [
        ref for ref in get_parse_context().foot_references if ref in foot_defs
    ]
    return cls(
        children=children,
        front_matter=front_matter_token,
        link_definitions=get_parse_context().link_definitions,
        footnotes=foot_defs,
        footref_order=ref_order,
    )
def is_link_label(text):
    """Return the stored link definition matching ``text``, or ``None``.

    Any unescaped ``[`` or ``]`` within the text invalidates it as a label.
    """
    escaped = False
    for char in text:
        if char == "\\" and not escaped:
            escaped = True
        elif char in "[]" and not escaped:
            return None
        elif escaped:
            escaped = False
    if not text.strip():
        return None
    return get_parse_context().link_definitions.get(normalize_label(text), None)
def tokenize_span(string, token_types=None):
    """Convert a string to a list of span tokens.

    :param string: the string to parse
    :param token_types: override span-level tokens set in global context
    :returns: list of span-level token instances.
    """
    if token_types is None:
        token_types = get_parse_context().span_tokens
    # the final token type acts as the fallback for otherwise unmatched text
    *token_types, fallback_token = token_types
    tokens = find_tokens(string, token_types, fallback_token)
    token_buffer = []
    if tokens:
        # resolve overlaps pairwise, accumulating surviving tokens in order
        prev = tokens[0]
        for curr in tokens[1:]:
            prev = eval_tokens(prev, curr, token_buffer)
        token_buffer.append(prev)
    return make_tokens(token_buffer, 0, len(string), string, fallback_token)
def tokenize_block(
    lines: SourceLines, token_types=None, skip_tokens=("LinkDefinition", "Footnote")
):
    """Returns a list of parsed tokens."""
    assert isinstance(lines, SourceLines), "lines must be `SourceLines` instance"
    if token_types is None:
        token_types = get_parse_context().block_tokens
    parsed_tokens = ParseBuffer()
    line = lines.peek()
    while line is not None:
        consumed = False
        for token_type in token_types:
            if token_type.start(line):
                token = token_type.read(lines)
                if token is not None:
                    if token.name not in skip_tokens:
                        parsed_tokens.append(token)
                    consumed = True
                    break
        if not consumed:
            # no token type matched: this is an unmatched newline
            next(lines)
            parsed_tokens.loose = True
        line = lines.peek()
    return parsed_tokens
def find_nested_tokenizer(string):
    """Walk ``string`` once, collecting delimiters and matches for span parsing.

    Records InlineCode/Math/Strikethrough/FootReference matches into the global
    context's ``nesting_matches`` and resolves links/images/emphasis via the
    collected ``Delimiter`` objects, returning the list of matches.
    """
    get_parse_context().nesting_matches = {}  # reset nesting matches
    if not string:
        return []
    delimiters = []
    matches = []
    escaped = False  # escaped denotes that the last cursor position had `\`
    in_delimiter_run = None  # delimiter runs are sequences of `*` or `_`
    in_image = False
    start = 0
    i = 0
    # only search for optional token types that are enabled in the context
    has_math = Math in get_parse_context().span_tokens
    has_strikethrough = Strikethrough in get_parse_context().span_tokens
    has_footrefs = FootReference in get_parse_context().span_tokens
    code_match, strike_match, math_match = advance_searches(
        string, 0, has_strikethrough, has_math)
    while i < len(string):
        if strike_match is not None and i == strike_match.start():
            get_parse_context().nesting_matches.setdefault(
                "Strikethrough", []).append(strike_match)
            # re-search from i + 1: the next match starts strictly after i,
            # so the `continue` without advancing i cannot stall the loop
            strike_match = Strikethrough.pattern.search(string, i + 1)
            continue
        if code_match is not None and i == code_match.start():
            # close any open delimiter run before skipping the code span
            if in_delimiter_run:
                delimiters.append(Delimiter(start, i, string))
            in_delimiter_run = None
            get_parse_context().nesting_matches.setdefault(
                "InlineCode", []).append(code_match)
            i = code_match.end()
            code_match, strike_match, math_match = advance_searches(
                string, i, has_strikethrough, has_math)
            continue
        if math_match is not None and i == math_match.start():
            if in_delimiter_run:
                delimiters.append(Delimiter(start, i, string))
            in_delimiter_run = None
            get_parse_context().nesting_matches.setdefault(
                "Math", []).append(math_match)
            i = math_match.end()
            code_match, strike_match, math_match = advance_searches(
                string, i, has_strikethrough, has_math)
            continue
        c = string[i]
        if c == "\\" and not escaped:
            escaped = True
            i += 1
            continue
        # a non-run character (or an escape) terminates the current run
        if in_delimiter_run is not None and (c != in_delimiter_run or escaped):
            delimiters.append(
                Delimiter(start, i if not escaped else i - 1, string))
            in_delimiter_run = None
        if in_delimiter_run is None and (c == "*" or c == "_") and not escaped:
            in_delimiter_run = c
            start = i
        if not escaped:
            if c == "[":
                # footnote references (e.g. `[^label]`) take precedence over links
                foot_ref_match = match_foot_ref(string, i) if has_footrefs else None
                if foot_ref_match:
                    get_parse_context().nesting_matches.setdefault(
                        "FootReference", []).append(foot_ref_match)
                    i = foot_ref_match.end()
                    in_image = False
                    continue
                if not in_image:
                    delimiters.append(Delimiter(i, i + 1, string))
                else:
                    # include the preceding `!` in the image delimiter
                    delimiters.append(Delimiter(i - 1, i + 1, string))
                    in_image = False
            elif c == "!":
                in_image = True
            elif c == "]":
                i = find_link_image(string, i, delimiters, matches)
                code_match, strike_match, math_match = advance_searches(
                    string, i, has_strikethrough, has_math)
            elif in_image:
                in_image = False
        else:
            escaped = False
        i += 1
    if in_delimiter_run:
        delimiters.append(Delimiter(start, i, string))
    process_emphasis(string, None, delimiters, matches)
    return matches
def test_repeated_link_defs(caplog):
    """A duplicate link label logs a warning and only the first is kept."""
    source_lines = ["[a]: value1\n", "[a]: value2\n"]
    tokenize_main(source_lines)
    assert "ignoring duplicate link definition" in caplog.text
    assert len(get_parse_context().link_definitions) == 1
def read(cls, match: Pattern):
    """Create a footnote-reference token from a regex match."""
    label = match.group(1)
    # add the target to an ordered set, so the order of reference is recorded
    get_parse_context().foot_references.add(label)
    return cls(target=label)
def find(cls, string):
    """Return (and consume) the math matches recorded during the nesting walk."""
    return get_parse_context().nesting_matches.pop("Math", [])
def parse(self, inputstring, document):
    """Parse a Jupyter notebook (as text) into the docutils ``document``.

    :param inputstring: the notebook file content (JSON text)
    :param document: the docutils document to populate
    """
    # de-serialize the notebook
    ntbk = nbf.reads(inputstring, nbf.NO_CONVERT)

    # This is a container for top level markdown tokens
    # which we will add to as we walk the document
    mkdown_tokens = []  # type: list[BlockToken]

    # First we ensure that we are using a 'clean' global context
    # for parsing, which is setup with the MyST parsing tokens
    # the logger will report on duplicate link/footnote definitions, etc
    parse_context = ParseContext(
        find_blocks=SphinxNBRenderer.default_block_tokens,
        find_spans=SphinxNBRenderer.default_span_tokens,
        logger=SPHINX_LOGGER,
    )
    set_parse_context(parse_context)

    for cell_index, nb_cell in enumerate(ntbk.cells):

        # Skip empty cells
        if len(nb_cell["source"].strip()) == 0:
            continue

        # skip cells tagged for removal
        tags = nb_cell.metadata.get("tags", [])
        if "remove_cell" in tags:
            continue

        if nb_cell["cell_type"] == "markdown":

            # we add the document path and cell index
            # to the source lines, so they can be included in the error logging
            # NOTE: currently the logic to report metadata is not written
            # into SphinxRenderer, but this will be introduced in a later update
            lines = SourceLines(
                nb_cell["source"],
                uri=document["source"],
                metadata={"cell_index": cell_index},
                standardize_ends=True,
            )

            # parse the source markdown text;
            # at this point span/inline level tokens are not yet processed, but
            # link/footnote definitions are collected/stored in the global context
            mkdown_tokens.extend(tokenize_block(lines))

            # TODO for md cells, think of a way to implement the previous
            # `if "hide_input" in tags:` logic

        elif nb_cell["cell_type"] == "code":
            # here we do nothing but store the cell as a custom token
            mkdown_tokens.append(
                NbCodeCell(
                    cell=nb_cell,
                    position=Position(
                        line_start=0,
                        uri=document["source"],
                        data={"cell_index": cell_index},
                    ),
                ))

    # Now all definitions have been gathered, we walk the tokens and
    # process any inline text
    for token in mkdown_tokens + list(
            get_parse_context().foot_definitions.values()):
        token.expand_spans()

    # If there are widgets, this will embed the state of all widgets in a script
    if contains_widgets(ntbk):
        mkdown_tokens.insert(0, JupyterWidgetState(state=get_widgets(ntbk)))

    # create the front matter token
    front_matter = FrontMatter(content=ntbk.metadata, position=None)

    # Finally, we create the top-level markdown document
    markdown_doc = Document(
        children=mkdown_tokens,
        front_matter=front_matter,
        link_definitions=parse_context.link_definitions,
        footnotes=parse_context.foot_definitions,
        footref_order=parse_context.foot_references,
    )

    self.reporter = document.reporter
    self.config = self.default_config.copy()
    try:
        new_cfg = document.settings.env.config.myst_config
        self.config.update(new_cfg)
    except AttributeError:
        # myst_config is not set on the sphinx environment; keep defaults
        pass

    # Remove all the mime prefixes from "glue" step.
    # This way, writing properly captures the glued images
    replace_mime = []
    for cell in ntbk.cells:
        if hasattr(cell, "outputs"):
            for out in cell.outputs:
                if "data" in out:
                    # Only do the mimebundle replacing for the scrapbook outputs
                    mime_prefix = (out.get("metadata", {}).get("scrapbook",
                                                               {}).get("mime_prefix"))
                    if mime_prefix:
                        out["data"] = {
                            key.replace(mime_prefix, ""): val
                            for key, val in out["data"].items()
                        }
                        replace_mime.append(out)

    # Write the notebook's output to disk.
    # This changes metadata in notebook cells
    path_doc = Path(document.settings.env.docname)
    doc_relpath = path_doc.parent
    doc_filename = path_doc.name
    build_dir = Path(document.settings.env.app.outdir).parent
    output_dir = build_dir.joinpath("jupyter_execute", doc_relpath)
    write_notebook_output(ntbk, str(output_dir), doc_filename)

    # Now add back the mime prefixes to the right outputs so they aren't rendered
    # until called from the role/directive
    for out in replace_mime:
        out["data"] = {
            f"{GLUE_PREFIX}{key}": val for key, val in out["data"].items()
        }

    # Update our glue key list with new ones defined in this page
    glue_domain = NbGlueDomain.from_env(document.settings.env)
    glue_domain.add_notebook(ntbk, path_doc)

    # render the Markdown AST to docutils AST
    renderer = SphinxNBRenderer(
        parse_context=parse_context, document=document, current_node=None)
    renderer.render(markdown_doc)
def find(cls, string):
    """Return (and consume) footnote-reference matches from the nesting walk."""
    return get_parse_context().nesting_matches.pop("FootReference", [])
def myst_to_notebook(
    text,
    code_directive=CODE_DIRECTIVE,
    raw_directive=RAW_DIRECTIVE,
    ignore_bad_meta=False,
    store_line_numbers=False,
):
    """Convert text written in the myst format to a notebook.

    :param text: the file text
    :param code_directive: the name of the directive to search for containing code cells
    :param raw_directive: the name of the directive to search for containing raw cells
    :param ignore_bad_meta: ignore metadata that cannot be parsed as JSON/YAML
    :param store_line_numbers: add a `_source_lines` key to cell metadata,
        mapping to the source text.

    NOTE: we assume here that all of these directives are at the top-level,
    i.e. not nested in other directives.
    """
    from mistletoe.base_elements import SourceLines
    from mistletoe.parse_context import (
        ParseContext,
        get_parse_context,
        set_parse_context,
    )
    from mistletoe.block_tokens import Document, CodeFence

    from myst_parser.block_tokens import BlockBreak
    from myst_parser.parse_directives import DirectiveParsingError, parse_directive_text
    from myst_parser.docutils_renderer import DocutilsRenderer

    # convert the directive names to the fenced-code "language" form,
    # e.g. ``code-cell`` -> ``{code-cell}``
    code_directive = "{{{0}}}".format(code_directive)
    raw_directive = "{{{0}}}".format(raw_directive)

    # swap in a clean MyST parse context; the original is restored on exit
    original_context = get_parse_context()
    parse_context = ParseContext(
        find_blocks=DocutilsRenderer.default_block_tokens,
        find_spans=DocutilsRenderer.default_span_tokens,
    )

    if isinstance(text, SourceLines):
        lines = text
    else:
        lines = SourceLines(text, standardize_ends=True)

    try:
        set_parse_context(parse_context)
        doc = Document.read(lines, front_matter=True)

        # the YAML front matter becomes the notebook-level metadata
        metadata_nb = {}
        try:
            metadata_nb = doc.front_matter.get_data() if doc.front_matter else {}
        except (yaml.parser.ParserError, yaml.scanner.ScannerError) as error:
            if not ignore_bad_meta:
                raise MystMetadataParsingError("Notebook metadata: {}".format(error))

        nbf_version = nbf.v4
        kwargs = {"metadata": nbf.from_dict(metadata_nb)}
        notebook = nbf_version.new_notebook(**kwargs)

        # current_line tracks the start of the not-yet-consumed source;
        # md_metadata holds metadata (from a BlockBreak) for the next markdown cell
        current_line = 0 if not doc.front_matter else doc.front_matter.position.line_end
        md_metadata = {}

        for item in doc.walk(["CodeFence", "BlockBreak"]):
            if isinstance(item.node, BlockBreak):
                token = item.node  # type: BlockBreak
                # flush the markdown accumulated up to this break into a cell
                source = _fmt_md(
                    "".join(lines.lines[current_line:token.position.line_start - 1])
                )
                if source:
                    md_metadata = nbf.from_dict(md_metadata)
                    if store_line_numbers:
                        md_metadata["_source_lines"] = [
                            current_line,
                            token.position.line_start - 1,
                        ]
                    notebook.cells.append(
                        nbf_version.new_markdown_cell(
                            source=source,
                            metadata=md_metadata,
                        )
                    )
                # a break's content (if any) is JSON metadata for the NEXT cell
                if token.content:
                    md_metadata = {}
                    try:
                        md_metadata = json.loads(token.content.strip())
                    except Exception as err:
                        if not ignore_bad_meta:
                            raise MystMetadataParsingError(
                                "markdown cell {0} at {1} could not be read: {2}".format(
                                    len(notebook.cells) + 1, token.position, err
                                )
                            )
                    if not isinstance(md_metadata, dict):
                        if not ignore_bad_meta:
                            raise MystMetadataParsingError(
                                "markdown cell {0} at {1} is not a dict".format(
                                    len(notebook.cells) + 1, token.position
                                )
                            )
                else:
                    md_metadata = {}
                current_line = token.position.line_start
            if isinstance(item.node, CodeFence) and item.node.language in [
                code_directive,
                raw_directive,
            ]:
                token = item.node  # type: CodeFence
                # Note: we ignore anything after the directive on the first line
                # this is reserved for the optional lexer name
                # TODO: could log warning about if token.arguments != lexer name
                options, body_lines = {}, []
                try:
                    _, options, body_lines = parse_directive_text(
                        directive_class=MockDirective,
                        argument_str="",
                        content=token.children[0].content,
                        validate_options=False,
                    )
                except DirectiveParsingError as err:
                    if not ignore_bad_meta:
                        raise MystMetadataParsingError(
                            "Code cell {0} at {1} could not be read: {2}".format(
                                len(notebook.cells) + 1, token.position, err
                            )
                        )
                # flush markdown preceding the code/raw cell
                md_source = _fmt_md(
                    "".join(lines.lines[current_line:token.position.line_start - 1])
                )
                if md_source:
                    md_metadata = nbf.from_dict(md_metadata)
                    if store_line_numbers:
                        md_metadata["_source_lines"] = [
                            current_line,
                            token.position.line_start - 1,
                        ]
                    notebook.cells.append(
                        nbf_version.new_markdown_cell(
                            source=md_source,
                            metadata=md_metadata,
                        )
                    )
                current_line = token.position.line_end
                md_metadata = {}
                # the directive's parsed options become the cell metadata
                cell_metadata = nbf.from_dict(options)
                if store_line_numbers:
                    cell_metadata["_source_lines"] = [
                        token.position.line_start,
                        token.position.line_end,
                    ]
                if item.node.language == code_directive:
                    notebook.cells.append(
                        nbf_version.new_code_cell(
                            source="\n".join(body_lines),
                            metadata=cell_metadata,
                        )
                    )
                if item.node.language == raw_directive:
                    notebook.cells.append(
                        nbf_version.new_raw_cell(
                            source="\n".join(body_lines),
                            metadata=cell_metadata,
                        )
                    )

        # add the final markdown cell (if present)
        if lines.lines[current_line:]:
            md_metadata = nbf.from_dict(md_metadata)
            if store_line_numbers:
                md_metadata["_source_lines"] = [current_line, len(lines.lines)]
            notebook.cells.append(
                nbf_version.new_markdown_cell(
                    source=_fmt_md("".join(lines.lines[current_line:])),
                    metadata=md_metadata,
                )
            )

    finally:
        set_parse_context(original_context)

    return notebook
def reset_parse_context():
    """Ensure the parse context is reset before each test."""
    # local import so mistletoe is only required when the fixture runs
    from mistletoe.parse_context import get_parse_context

    # passing reset=True presumably rebuilds the global context — see
    # ``mistletoe.parse_context.get_parse_context``
    get_parse_context(reset=True)
def match_foot_ref(string, offset):
    """Match a footnote reference at ``offset``, returning a MatchObj or None."""
    match = FootReference.pattern.match(string[offset:])
    if match is None:
        return None
    target = match.group(1)
    # only recognise references whose definition has already been stored
    if target not in get_parse_context().foot_definitions:
        return None
    return MatchObj(offset, match.end() + offset, (-1, -1, target))