def _replace_unwanted_commands_with_spaces(tex: str) -> str:
    """
    Blank out pieces of equation TeX that should not be handed to KaTeX.

    KaTeX supports a large subset of LaTeX equation markup (see
    https://katex.org/docs/support_table.html), but not all of it. Commands
    that do not need to be parsed (e.g., 'label') are removed here so that
    they do not make KaTeX crash or behave unexpectedly. If 'label' were left
    in, for instance, its argument would be parsed as an equation and reported
    as a collection of symbols.

    Three kinds of text are replaced, in this order:

    1. the macros 'ref', 'label', and 'nonumber';
    2. length assignments found by 'EquationLengthAssignmentExtractor';
    3. ampersands and the begin / end delimiters of the 'split' environment.

    Every replacement goes through '_replace_substring_with_space'
    (presumably length-preserving, so that character offsets stay valid --
    confirm against that helper).
    """

    def blank_out(text: str, spans) -> str:
        # Overwrite each reported (start, end) range in turn.
        for span in spans:
            text = _replace_substring_with_space(text, span.start, span.end)
        return text

    # Pass 1: macros whose arguments must not be parsed as equation content.
    macros_to_strip = [
        MacroDefinition("ref", "#1"),
        MacroDefinition("label", "#1"),
        MacroDefinition("nonumber", ""),
    ]
    extractor = MacroExtractor()
    for definition in macros_to_strip:
        tex = blank_out(tex, extractor.parse(tex, definition))

    # Pass 2: length assignments.
    tex = blank_out(tex, EquationLengthAssignmentExtractor().parse(tex))

    # Pass 3: alignment markers and 'split' environment delimiters.
    patterns_to_strip = [
        Pattern("ampersand", "&"),
        Pattern("split_start", begin_environment_regex("split")),
        Pattern("split_end", end_environment_regex("split")),
    ]
    return blank_out(tex, scan_tex(tex, patterns_to_strip))
def parse(self, tex: str) -> Optional[EndDocument]:
    """
    Locate the first '\\end{document}' command in 'tex'.

    Returns an 'EndDocument' built from the start and end offsets of the
    first match reported by 'scan_tex', or None if there is no match.
    """
    # The pattern was previously labeled "begin_document" even though it
    # matches the *end* of the document; the label now says what it matches.
    pattern = Pattern("end_document", r"\\end{document}")
    scanner = scan_tex(tex, [pattern], include_unmatched=False)

    # Only the first occurrence matters; a default avoids try/except.
    match = next(scanner, None)
    if match is None:
        return None
    return EndDocument(match.start, match.end)
def parse(self, tex_path: str, tex: str) -> Iterator[Equation]:
    """
    Lazily yield the equations found in 'tex'.

    The parse state kept on the instance (token stack, source text, source
    path, and equation counter) is reset first. Because this is a generator,
    that reset happens when iteration starts, not when 'parse' is called.

    Each token produced by scanning 'tex' with 'self.PATTERNS' is passed to
    'self._process_token', and whatever equations that call produces are
    yielded in order.
    """
    # pylint: disable=attribute-defined-outside-init
    self._tex_path = tex_path
    self._tex = tex
    self._equation_index = 0
    self._stack: List[Match] = []

    for token in scan_tex(tex, self.PATTERNS):
        yield from self._process_token(token)
def parse(self, tex: str) -> Iterator[LengthAssignment]:
    """
    Yield a 'LengthAssignment' for every length assignment found in 'tex'.

    An assignment is recognized as: a backslash followed by one of the names
    in 'ARRAY_PARAMETERS', an equals sign, a run of digits and dots, and one
    of the units in 'LENGTH_UNITS', with optional whitespace around the
    equals sign and before the unit.
    """
    # Alternation of '\<name>' for every known parameter name.
    parameter_alternatives = "|".join(r"\\" + name for name in ARRAY_PARAMETERS)
    unit_alternatives = "|".join(LENGTH_UNITS)

    assignment_regex = (
        rf"(?:{parameter_alternatives})"
        r"\s*=\s*[0-9\.]+\s*"
        rf"(?:{unit_alternatives})"
    )

    tokens = scan_tex(tex, [Pattern("length_assignment", assignment_regex)])
    yield from (LengthAssignment(token.start, token.end) for token in tokens)
def parse(self, tex: str) -> Optional[Documentclass]:
    """
    Find the span of the first '\\documentclass' command in 'tex'.

    The scan walks tokens through three states:

    * "start": everything is skipped until a '\\documentclass' token.
    * "awaiting-required-arg": whitespace and bracketed arguments are
      skipped; a braced argument is recorded as the required argument;
      any non-whitespace unmatched text aborts the parse (None).
    * "awaiting-optional-arg": a bracketed argument ends the command at
      that argument's end; unmatched text ends it at that text's start.

    If the input runs out after the required argument was seen, the command
    ends at the required argument's end. Otherwise None is returned.

    NOTE(review): this relies on 'scan_tex(..., include_unmatched=True)'
    labeling unmatched text with the pattern name "UNKNOWN" -- confirm
    against 'scan_tex'.
    """
    token_patterns = [
        Pattern("documentclass", r"\\documentclass"),
        Pattern("optional_arg", r"\[[^\]]*?\]"),
        Pattern("required_arg", r"{[^}]*?}"),
    ]

    state = "start"
    command_start: int = -1
    class_arg = None

    for token in scan_tex(tex, token_patterns, include_unmatched=True):
        kind = token.pattern.name

        if state == "start":
            if kind == "documentclass":
                command_start = token.start
                state = "awaiting-required-arg"
            continue

        if state == "awaiting-required-arg":
            if kind == "UNKNOWN":
                # Stray text before the required argument: not a usable
                # document class declaration, so give up.
                if not token.text.isspace():
                    return None
            elif kind == "required_arg":
                class_arg = token
                state = "awaiting-optional-arg"
            continue

        # Remaining state: "awaiting-optional-arg". The first token that is
        # not part of the command marks where the command stops.
        if kind == "UNKNOWN":
            return Documentclass(command_start, token.start)
        if kind == "optional_arg":
            return Documentclass(command_start, token.end)

    # Input ended; succeed only if the required argument was found.
    if class_arg is None:
        return None
    return Documentclass(command_start, class_arg.end)