Example #1
def tokenize_spacers(text, prev=None):
    r"""Combine spacers [ + line break [ + spacer]]

    >>> tokenize_spacers(categorize('\t\n{there'))
    '\t\n'
    >>> tokenize_spacers(categorize('\t\nthere'))
    >>> tokenize_spacers(categorize('      \t     '))
    '      \t     '
    >>> tokenize_spacers(categorize(r' ccc'))
    """
    result = Token('', text.position)
    while text.hasNext() and text.peek().category == CC.Spacer:
        result += text.forward(1)
    if text.hasNext() and text.peek().category == CC.EndOfLine:
        result += text.forward(1)
    while text.hasNext() and text.peek().category == CC.Spacer:
        result += text.forward(1)
    result.category = TC.MergedSpacer

    if text.hasNext() and text.peek().category in (CC.Letter, CC.Other):
        # Whitespace directly followed by text belongs to the text token;
        # rewind everything consumed and yield nothing.
        text.backward(text.position - result.position)
        return

    if result:
        return result
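A minimal sketch of the two paths through this tokenizer, reusing the doctest inputs above (assuming the TexSoup package layout, so categorize and tokenize_spacers import as shown):

from TexSoup.category import categorize
from TexSoup.tokens import tokenize_spacers

# Spacer + line break followed by a group: whitespace merges into one token.
print(repr(tokenize_spacers(categorize('\t\n{there'))))  # '\t\n'

# Followed by a letter instead: the buffer is rewound and nothing is
# returned, leaving the whitespace for the text tokenizer.
print(tokenize_spacers(categorize('\t\nthere')))  # None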
Example #2
def tokenize_line_comment(text, prev=None):
    r"""Process a line comment

    :param Buffer text: iterator over line, with current position

    >>> tokenize_line_comment(categorize('%hello world\\'))
    '%hello world\\'
    >>> tokenize_line_comment(categorize('hello %world'))
    >>> tokenize_line_comment(categorize('%}hello world'))
    '%}hello world'
    >>> tokenize_line_comment(categorize('%}  '))
    '%}  '
    >>> tokenize_line_comment(categorize('%hello\n world'))
    '%hello'
    >>> b = categorize(r'\\%')
    >>> _ = next(b), next(b)
    >>> tokenize_line_comment(b)
    '%'
    >>> tokenize_line_comment(categorize(r'\%'))
    """
    result = Token('', text.position)
    if text.peek().category == CC.Comment and (prev is None
                                               or prev.category != CC.Comment):
        result += text.forward(1)
        while text.hasNext() and text.peek().category != CC.EndOfLine:
            result += text.forward(1)
        result.category = TC.Comment
        return result
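A quick sketch of the comment tokenizer's boundaries, under the same import assumptions as above:

from TexSoup.category import categorize
from TexSoup.tokens import tokenize_line_comment

# The comment token runs up to, but not including, the end of line.
print(repr(tokenize_line_comment(categorize('%hello\n world'))))  # '%hello'

# At the start of r'\%' the next character is the escape, not a comment
# character, so no comment token is produced.
print(tokenize_line_comment(categorize(r'\%')))  # None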
Example #3
def tokenize_math_sym_switch(text, prev=None):
    r"""Group characters in math switches.

    :param Buffer text: iterator over line, with current position

    >>> tokenize_math_sym_switch(categorize(r'$\min_x$ \command'))
    '$'
    >>> tokenize_math_sym_switch(categorize(r'$$\min_x$$ \command'))
    '$$'
    """
    if text.peek().category == CC.MathSwitch:
        if text.peek(1) and text.peek(1).category == CC.MathSwitch:
            result = Token(text.forward(2), text.position)
            result.category = TC.DisplayMathSwitch
        else:
            result = Token(text.forward(1), text.position)
            result.category = TC.MathSwitch
        return result
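A short sketch distinguishing the two switch tokens, under the same import assumptions:

from TexSoup.category import categorize
from TexSoup.tokens import tokenize_math_sym_switch

inline = tokenize_math_sym_switch(categorize(r'$x$'))
display = tokenize_math_sym_switch(categorize(r'$$x$$'))
print(repr(inline), repr(display))          # '$' '$$'
print(inline.category == display.category)  # False: inline vs. display math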
Example #4
def categorize(text):
    r"""Generator for category code tokens on text, ignoring comments.

    :param Union[str,iterator,Buffer] text: LaTeX to process

    >>> chars = list(categorize(r'\bf{}%[ello+😂'))
    >>> chars[0].category
    <CategoryCodes.Escape: 1>
    >>> chars[1].category
    <CategoryCodes.Letter: 12>
    >>> chars[3].category
    <CategoryCodes.GroupBegin: 2>
    >>> chars[4].category
    <CategoryCodes.GroupEnd: 3>
    >>> chars[5].category
    <CategoryCodes.Comment: 15>
    >>> chars[6].category
    <CategoryCodes.BracketBegin: 19>
    >>> chars[-2].category
    <CategoryCodes.Other: 13>
    >>> chars[-1].category
    <CategoryCodes.Other: 13>
    >>> print(*chars)
    \ b f { } % [ e l l o + 😂
    >>> next(categorize(r'''
    ... ''')).category
    <CategoryCodes.EndOfLine: 6>
    """
    for position, char in enumerate(text):

        value = None
        # Find the first category whose character set contains this char.
        for cc, values in CATEGORY_CODES.items():
            if char in values:
                value = char
                break

        if value is None:
            yield Token(char, position, CC.Other)
        else:
            # `cc` still holds the category matched in the loop above.
            yield Token(char, position, cc)
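Driving the generator directly shows each character tagged with its category code, as the doctests above illustrate:

from TexSoup.category import categorize

for tok in categorize(r'\bf{}%'):
    # Token subclasses str, so each token prints as the character itself
    # (compare the print(*chars) doctest above).
    print(tok, tok.category)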
Example #5
def read_command(buf,
                 n_required_args=-1,
                 n_optional_args=-1,
                 skip=0,
                 tolerance=0,
                 mode=MODE_NON_MATH):
    r"""Parses command and all arguments. Assumes escape has just been parsed.

    No whitespace is allowed between escape and command name. e.g.,
    :code:`\ textbf` is a backslash command, then text :code:`textbf`. Only
    :code:`\textbf` is the bold command.

    >>> from TexSoup.category import categorize
    >>> from TexSoup.tokens import tokenize
    >>> buf = Buffer(tokenize(categorize('\\sect  \t    \n\t{wallawalla}')))
    >>> next(buf)
    '\\'
    >>> read_command(buf)
    ('sect', [BraceGroup('wallawalla')])
    >>> buf = Buffer(tokenize(categorize('\\sect  \t   \n\t \n{bingbang}')))
    >>> _ = next(buf)
    >>> read_command(buf)
    ('sect', [])
    >>> buf = Buffer(tokenize(categorize('\\sect{ooheeeee}')))
    >>> _ = next(buf)
    >>> read_command(buf)
    ('sect', [BraceGroup('ooheeeee')])
    >>> buf = Buffer(tokenize(categorize(r'\item aaa {bbb} ccc\end{itemize}')))
    >>> read_command(buf, skip=1)
    ('item', [])
    >>> buf.peek()
    ' aaa '

    # >>> buf = Buffer(tokenize(categorize('\\sect abcd')))
    # >>> _ = next(buf)
    # >>> read_command(buf)
    # ('sect', ('a',))
    """
    for _ in range(skip):
        next(buf)

    name = next(buf)
    token = Token('', buf.position)
    if n_required_args < 0 and n_optional_args < 0:
        n_required_args, n_optional_args = SIGNATURES.get(name, (-1, -1))
    args = read_args(buf,
                     n_required_args,
                     n_optional_args,
                     tolerance=tolerance,
                     mode=mode)
    return name, args
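The doctests above already show the full pipeline; condensed into a sketch (the categorize and tokenize imports appear in the doctests themselves; Buffer in TexSoup.utils and read_command in TexSoup.reader are assumed from the package layout):

from TexSoup.category import categorize
from TexSoup.tokens import tokenize
from TexSoup.utils import Buffer
from TexSoup.reader import read_command

buf = Buffer(tokenize(categorize(r'\textbf{hi}')))
_ = next(buf)  # consume the escape; the function assumes it was just parsed
print(read_command(buf))  # ('textbf', [BraceGroup('hi')])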
Example #6
def tokenize_string(text, prev=None):
    r"""Process a string of text

    :param Buffer text: iterator over line, with current position

    >>> tokenize_string(categorize('hello'))
    'hello'
    >>> b = categorize(r'hello again\command')
    >>> tokenize_string(b)
    'hello again'
    >>> print(b.peek())
    \
    >>> print(tokenize_string(categorize(r'0 & 1\\\command')))
    0 & 1
    """
    result = Token('', text.position, category=TC.Text)
    while text.hasNext() and text.peek().category not in (
            CC.Escape, CC.GroupBegin, CC.GroupEnd, CC.MathSwitch,
            CC.BracketBegin, CC.BracketEnd, CC.Comment):
        result += next(text)
    return result
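A sketch of how the text tokenizer stops at a special character and leaves it in the buffer, mirroring the doctest above:

from TexSoup.category import categorize
from TexSoup.tokens import tokenize_string

b = categorize(r'hello again\command')
print(repr(tokenize_string(b)))  # 'hello again'
print(b.peek())                  # \  (the escape is still unconsumed)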
Example #7
def search_regex(self, pattern):
    r"""Yield a Token for every match of pattern in this node's text."""
    for node in self.text:
        for match in re.finditer(pattern, node):
            body = match.group()  # group() returns the full match
            start = match.start()
            yield Token(body, node.position + start)
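This method sits on the soup/node class, so in practice it is reached through the public entry point; a hedged sketch (assuming search_regex is exposed on the object the TexSoup constructor returns, as in recent releases):

from TexSoup import TexSoup

soup = TexSoup(r'hello \textbf{world}, hello again')
for match in soup.search_regex('hello'):
    # Each match is a Token carrying its position in the source.
    print(match, match.position)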