def tokenize_spacers(text, prev=None):
    r"""Combine spacers [ + line break [ + spacer]]

    >>> tokenize_spacers(categorize('\t\n{there'))
    '\t\n'
    >>> tokenize_spacers(categorize('\t\nthere'))
    >>> tokenize_spacers(categorize(' \t '))
    ' \t '
    >>> tokenize_spacers(categorize(r' ccc'))
    """
    result = Token('', text.position)
    while text.hasNext() and text.peek().category == CC.Spacer:
        result += text.forward(1)
    if text.hasNext() and text.peek().category == CC.EndOfLine:
        result += text.forward(1)
        while text.hasNext() and text.peek().category == CC.Spacer:
            result += text.forward(1)
    result.category = TC.MergedSpacer

    if text.hasNext() and text.peek().category in (CC.Letter, CC.Other):
        text.backward(text.position - result.position)
        return

    if result:
        return result
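

# A minimal usage sketch for tokenize_spacers, mirroring the doctests above.
# It assumes categorize() hands back a Buffer (as the doctests imply) and that
# CC/TC are in scope as in this module; the demo name is hypothetical and not
# part of the public API.
def _demo_tokenize_spacers():
    buf = categorize('\t\n{there')
    spacer = tokenize_spacers(buf)
    assert spacer == '\t\n'                      # spacer + line break merged
    assert spacer.category == TC.MergedSpacer
    assert buf.peek().category == CC.GroupBegin  # stopped before the brace

    # Whitespace followed by a letter is rewound and yields None, so the
    # spaces stay attached to the text token that follows.
    assert tokenize_spacers(categorize(' ccc')) is None
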
def tokenize_line_comment(text, prev=None):
    r"""Process a line comment

    :param Buffer text: iterator over line, with current position

    >>> tokenize_line_comment(categorize('%hello world\\'))
    '%hello world\\'
    >>> tokenize_line_comment(categorize('hello %world'))
    >>> tokenize_line_comment(categorize('%}hello world'))
    '%}hello world'
    >>> tokenize_line_comment(categorize('%} '))
    '%} '
    >>> tokenize_line_comment(categorize('%hello\n world'))
    '%hello'
    >>> b = categorize(r'\\%')
    >>> _ = next(b), next(b)
    >>> tokenize_line_comment(b)
    '%'
    >>> tokenize_line_comment(categorize(r'\%'))
    """
    result = Token('', text.position)
    if text.peek().category == CC.Comment and (
            prev is None or prev.category != CC.Comment):
        result += text.forward(1)
        while text.hasNext() and text.peek().category != CC.EndOfLine:
            result += text.forward(1)
        result.category = TC.Comment
        return result
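

# Sketch of the comment tokenizer's boundary behavior: the comment runs to the
# end of the line, but the newline itself is left for the caller. Assumes
# categorize() returns a Buffer as in the doctests; the demo is illustrative.
def _demo_tokenize_line_comment():
    buf = categorize('%note\nrest')
    comment = tokenize_line_comment(buf)
    assert comment == '%note'
    assert comment.category == TC.Comment
    assert buf.peek().category == CC.EndOfLine  # newline not consumed
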
def tokenize_math_sym_switch(text, prev=None):
    r"""Group characters in math switches.

    :param Buffer text: iterator over line, with current position

    >>> tokenize_math_sym_switch(categorize(r'$\min_x$ \command'))
    '$'
    >>> tokenize_math_sym_switch(categorize(r'$$\min_x$$ \command'))
    '$$'
    """
    if text.peek().category == CC.MathSwitch:
        if text.peek(1) and text.peek(1).category == CC.MathSwitch:
            result = Token(text.forward(2), text.position)
            result.category = TC.DisplayMathSwitch
        else:
            result = Token(text.forward(1), text.position)
            result.category = TC.MathSwitch
        return result
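

# Sketch distinguishing inline from display math switches: a single '$' yields
# TC.MathSwitch and a doubled '$$' yields TC.DisplayMathSwitch. Assumes
# categorize() returns a Buffer; the demo name is hypothetical.
def _demo_tokenize_math_sym_switch():
    inline = tokenize_math_sym_switch(categorize(r'$x$'))
    assert inline == '$' and inline.category == TC.MathSwitch

    display = tokenize_math_sym_switch(categorize(r'$$x$$'))
    assert display == '$$' and display.category == TC.DisplayMathSwitch
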
def categorize(text):
    r"""Generator for category code tokens on text, ignoring comments.

    :param Union[str,iterator,Buffer] text: LaTeX to process

    >>> chars = list(categorize(r'\bf{}%[ello+😂'))
    >>> chars[0].category
    <CategoryCodes.Escape: 1>
    >>> chars[1].category
    <CategoryCodes.Letter: 12>
    >>> chars[3].category
    <CategoryCodes.GroupBegin: 2>
    >>> chars[4].category
    <CategoryCodes.GroupEnd: 3>
    >>> chars[5].category
    <CategoryCodes.Comment: 15>
    >>> chars[6].category
    <CategoryCodes.BracketBegin: 19>
    >>> chars[-2].category
    <CategoryCodes.Other: 13>
    >>> chars[-1].category
    <CategoryCodes.Other: 13>
    >>> print(*chars)
    \ b f { } % [ e l l o + 😂
    >>> next(categorize(r'''
    ... ''')).category
    <CategoryCodes.EndOfLine: 6>
    """
    for position, char in enumerate(text):
        # Record the first category whose character set contains this char,
        # rather than relying on the loop variable leaking out of the loop.
        value = None
        for cc, values in CATEGORY_CODES.items():
            if char in values:
                value = cc
                break

        if value is None:
            yield Token(char, position, CC.Other)
        else:
            yield Token(char, position, value)
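

# Sketch of the lookup rule above: a character takes the first CategoryCodes
# entry whose character set contains it, and falls back to CC.Other. The demo
# name is hypothetical.
def _demo_categorize():
    tokens = list(categorize(r'\a{'))
    assert [t.category for t in tokens] == \
        [CC.Escape, CC.Letter, CC.GroupBegin]

    # A character absent from every CATEGORY_CODES entry becomes CC.Other.
    assert next(categorize('+')).category == CC.Other
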
def read_command(buf, n_required_args=-1, n_optional_args=-1, skip=0,
                 tolerance=0, mode=MODE_NON_MATH):
    r"""Parses command and all arguments. Assumes escape has just been parsed.

    No whitespace is allowed between escape and command name. e.g.,
    :code:`\ textbf` is a backslash command followed by the text
    :code:`textbf`. Only :code:`\textbf` is the bold command.

    >>> from TexSoup.category import categorize
    >>> from TexSoup.tokens import tokenize
    >>> buf = Buffer(tokenize(categorize('\\sect \t \n\t{wallawalla}')))
    >>> next(buf)
    '\\'
    >>> read_command(buf)
    ('sect', [BraceGroup('wallawalla')])
    >>> buf = Buffer(tokenize(categorize('\\sect \t \n\t \n{bingbang}')))
    >>> _ = next(buf)
    >>> read_command(buf)
    ('sect', [])
    >>> buf = Buffer(tokenize(categorize('\\sect{ooheeeee}')))
    >>> _ = next(buf)
    >>> read_command(buf)
    ('sect', [BraceGroup('ooheeeee')])
    >>> buf = Buffer(tokenize(categorize(r'\item aaa {bbb} ccc\end{itemize}')))
    >>> read_command(buf, skip=1)
    ('item', [])
    >>> buf.peek()
    ' aaa '

    # >>> buf = Buffer(tokenize(categorize('\\sect abcd')))
    # >>> _ = next(buf)
    # >>> read_command(buf)
    # ('sect', ('a',))
    """
    for _ in range(skip):
        next(buf)

    name = next(buf)
    token = Token('', buf.position)
    if n_required_args < 0 and n_optional_args < 0:
        n_required_args, n_optional_args = SIGNATURES.get(name, (-1, -1))
    args = read_args(buf, n_required_args, n_optional_args,
                     tolerance=tolerance, mode=mode)
    return name, args
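

# Sketch of a full command parse; imports mirror the doctest setup above. The
# escape token must already be consumed, per the docstring. Whether 'textbf'
# appears in SIGNATURES does not matter here: either way the single brace
# group is read as its argument. The demo name is hypothetical.
def _demo_read_command():
    from TexSoup.category import categorize
    from TexSoup.tokens import tokenize
    buf = Buffer(tokenize(categorize(r'\textbf{hi}')))
    next(buf)  # consume the escape, as read_command assumes
    name, args = read_command(buf)
    assert name == 'textbf'
    assert len(args) == 1  # the single BraceGroup argument
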
def tokenize_string(text, prev=None):
    r"""Process a string of text

    :param Buffer text: iterator over line, with current position

    >>> tokenize_string(categorize('hello'))
    'hello'
    >>> b = categorize(r'hello again\command')
    >>> tokenize_string(b)
    'hello again'
    >>> print(b.peek())
    \
    >>> print(tokenize_string(categorize(r'0 & 1\\\command')))
    0 & 1
    """
    result = Token('', text.position, category=TC.Text)
    while text.hasNext() and text.peek().category not in (
            CC.Escape, CC.GroupBegin, CC.GroupEnd, CC.MathSwitch,
            CC.BracketBegin, CC.BracketEnd, CC.Comment):
        result += next(text)
    return result
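

# Sketch of the stop condition: plain text (including spaces) accumulates
# until one of the special categories appears. Assumes categorize() returns a
# Buffer; the demo name is hypothetical.
def _demo_tokenize_string():
    buf = categorize(r'text {group}')
    text = tokenize_string(buf)
    assert text == 'text ' and text.category == TC.Text
    assert buf.peek().category == CC.GroupBegin  # halted at the brace
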
def search_regex(self, pattern):
    r"""Yield a Token for every regex match in this object's text nodes.

    Each yielded Token's position is the match offset within the node,
    shifted by the node's own position in the source.
    """
    for node in self.text:
        for match in re.finditer(pattern, node):
            body = match.group()  # group() returns the full match
            start = match.start()
            yield Token(body, node.position + start)
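

# Hypothetical harness for search_regex, since its owning class is defined
# elsewhere: a stand-in with a .text list of position-carrying nodes. Assumes
# Token behaves like str (so re.finditer can scan it), as the code above does.
def _demo_search_regex():
    class _Holder:
        text = [Token('ab ab', 10)]
        find = search_regex  # bind the free function as a method
    positions = [tok.position for tok in _Holder().find(r'ab')]
    assert positions == [10, 13]  # node offset plus each match start
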