def forward_until_new(s):
    """Advance the buffer past leading whitespace.

    Accumulates whitespace tokens from the buffer until the next
    character is non-whitespace, or until the accumulated text (ignoring
    trailing spaces) ends with a newline.

    :param Buffer s: iterator over line, with current position
    :return: the consumed whitespace as a single token
    """
    consumed = TokenWithPosition('', s.peek().position)
    while s.hasNext():
        at_whitespace = any(
            s.peek().startswith(ws) for ws in string.whitespace)
        if not at_whitespace or consumed.strip(" ").endswith('\n'):
            break
        consumed += s.forward(1)
    return consumed
def tokenize_math(text):
    r"""Prevents math from being tokenized.

    Consumes an entire ``$...$`` or ``$$...$$`` span, delimiters
    included, and returns it as a single token. If the buffer is not at
    a ``$``, nothing is consumed and ``None`` is returned.

    :param Buffer text: iterator over line, with current position
    :raises EOFError: if no closing delimiter is found before the
        buffer ends

    >>> b = Buffer('$$\min_x$$ \command')
    >>> tokenize_math(b)
    '$$\\min_x$$'
    """
    def escaped_dollar():
        # A '$' preceded by a backslash is a literal dollar sign,
        # not a closing delimiter.
        return text.peek() == '$' and result[-1] == '\\'

    def end_detected():
        # Closing delimiter: the starter string reappears, unescaped.
        return (text.peek((0, len(starter))) == starter
                and not escaped_dollar())

    result = TokenWithPosition('', text.position)
    if text.startswith('$'):
        # '$$' must be tested first, since '$' is its prefix.
        starter = '$$' if text.startswith('$$') else '$'
        result += text.forward(len(starter))
        while text.hasNext() and not end_detected():
            result += next(text)
        if not text.startswith(starter):
            # Ran off the end of the buffer without a closing delimiter.
            raise EOFError('Expecting %s. Instead got %s' % (
                starter, text.peek((0, 5))))
        result += text.forward(len(starter))
        return result
def tokenize_string(text, delimiters=None):
    r"""Process a string of text

    Accumulates characters until an unescaped delimiter is reached; the
    delimiter is left in the buffer for the next tokenizer to handle.

    :param Buffer text: iterator over line, with current position
    :param Union[None,iterable,str] delimiters: defines the delimiters

    >>> tokenize_string(Buffer('hello'))
    'hello'
    >>> b = Buffer(r'hello again\command')
    >>> tokenize_string(b)
    'hello again'
    >>> print(b.peek())
    \
    >>> print(tokenize_string(Buffer(r'0 & 1 \\\command')))
    0 & 1 \\
    """
    if delimiters is None:
        delimiters = ALL_TOKENS
    result = TokenWithPosition('', text.position)
    for c in text:
        if c == '\\' and str(text.peek()) in delimiters and str(
                c + text.peek()) not in delimiters:
            # An escaped delimiter (e.g. '\$') is literal text: fold the
            # delimiter character into this token instead of stopping.
            c += next(text)
        elif str(c) in delimiters:  # assumes all tokens are single characters
            # Unescaped delimiter: rewind one step so the next tokenizer
            # sees the delimiter, then end this string token.
            text.backward(1)
            return result
        result += c
        if text.peek((0, 2)) == '\\\\':
            # Keep a LaTeX line break ('\\') inside the string token.
            result += text.forward(2)
        if text.peek((0, 2)) == '\n\n':
            # A blank line (paragraph break) terminates the string token.
            result += text.forward(2)
            return result
    return result
def read_tex(src):
    r"""Read next expression from buffer

    :param Buffer src: a buffer of tokens
    :return: parsed expression (TexEnv, TexCmd, or a plain token)
    """
    c = next(src)
    if c.startswith('$'):
        # The tokenizer emits math as a single token including both
        # delimiters; strip them and wrap the contents in a begin-less
        # environment.
        name = '$$' if c.startswith('$$') else '$'
        return TexEnv(name, [c[len(name):-len(name)]], nobegin=True)
    if c == '\\':
        if src.peek().startswith('item '):
            # \item with inline content: the first 4 characters are the
            # command name 'item'; the rest of the token becomes the
            # command's extra content.
            mode, expr = 'command', TexCmd(
                src.peek()[:4], (),
                TokenWithPosition.join(
                    next(src).split(' ')[1:], glue=' ').strip())
        elif src.peek() == 'begin':
            # \begin{env}: mode is the 'begin' token itself; forward(3)
            # presumably consumes the brace group -- verify against the
            # tokenizer's output.
            mode, expr = next(src), TexEnv(Arg.parse(src.forward(3)).value)
        else:
            # Generic command: split the candidate token at the first
            # whitespace into command name and trailing content.
            mode, candidate, expr = 'command', next(src), None
            for i, c in enumerate(candidate):
                if c.isspace():
                    expr = TexCmd(candidate[:i], (), candidate[i+1:])
                    break
            if not expr:
                expr = TexCmd(candidate)
        while src.peek() in ARG_START_TOKENS:
            expr.args.append(read_tex(src))
        if mode == 'begin':
            # Recursively read the environment's contents up to \end.
            read_env(src, expr)
        if src.startswith('$'):
            # Math immediately following the command is attached to it.
            expr.add_contents(read_tex(src))
        return expr
    if c.startswith('\\'):
        # Token that merely begins with a backslash (but is not the bare
        # '\\' handled above): a command name with no arguments.
        return TexCmd(c[1:])
    if c in ARG_START_TOKENS:
        return read_arg(src, c)
    return c
def read_tex(src):
    r"""Read next expression from buffer

    :param Buffer src: a buffer of tokens
    :return: parsed expression (TexEnv, TexCmd, or a plain token)
    """
    c = next(src)
    if c.startswith('$'):
        # Math delimiter token: read up to the matching delimiter.
        name = '$$' if c.startswith('$$') else '$'
        expr = TexEnv(name, [], nobegin=True)
        return read_math_env(src, expr)
    if c.startswith('\\'):
        command = TokenWithPosition(c[1:], src.position)
        if command == 'item':
            # Item content runs until a newline, \end, or the next \item.
            extra = src.forward_until(lambda string: any(
                [string.startswith(s) for s in {'\n', '\end', '\item'}]))
            mode, expr = 'command', TexCmd(
                command, (),
                TokenWithPosition.join(extra.split(' '), glue=' ').strip())
        elif command == 'begin':
            # src.peek(1) is the environment name; forward(3) presumably
            # consumes '{', name, '}' -- verify against the tokenizer.
            mode, expr, _ = 'begin', TexEnv(src.peek(1)), src.forward(3)
        else:
            mode, expr = 'command', TexCmd(command)
        # TODO: allow only one line break
        # TODO: should really be handled by tokenizer
        # Tentatively skip whitespace so arguments on a following line
        # are still attached to the command; rewound below if no
        # arguments actually follow.
        candidate_index = src.num_forward_until(lambda s: not s.isspace())
        src.forward(candidate_index)
        while src.peek() in ARG_START_TOKENS:
            expr.args.append(read_tex(src))
        if not expr.args:
            # No arguments found: restore the skipped whitespace.
            src.backward(candidate_index)
        if mode == 'begin':
            # Read the environment body up to the matching \end.
            read_env(src, expr)
        return expr
    if c in ARG_START_TOKENS:
        return read_arg(src, c)
    return c
def tokenize_math(text):
    r"""Emit a math-mode delimiter token, so math is not tokenized further.

    Only an unescaped dollar sign at the current position counts as a
    delimiter; a ``$`` preceded by a backslash is left for the string
    tokenizer. Returns ``None`` when the buffer is not at a delimiter.

    :param Buffer text: iterator over line, with current position

    >>> b = Buffer('$\min_x$ \command')
    >>> tokenize_math(b)
    '$'
    >>> b = Buffer('$$\min_x$$ \command')
    >>> tokenize_math(b)
    '$$'
    """
    if not text.startswith('$'):
        return None
    if text.position != 0 and text.peek(-1) == '\\':
        # Escaped dollar sign: not a math delimiter.
        return None
    # '$$' must be checked before its prefix '$'.
    delim = '$$' if text.startswith('$$') else '$'
    # Consume the delimiter first; the token records the post-advance
    # position, matching the original evaluation order.
    consumed = text.forward(len(delim))
    return TokenWithPosition(consumed, text.position)
def tokenize_line_comment(text):
    r"""Capture a line comment: ``%`` up to, but excluding, the newline.

    Returns ``None`` when the buffer is not at an unescaped ``%``.

    :param Buffer text: iterator over line, with current position

    >>> tokenize_line_comment(Buffer('hello %world'))
    >>> tokenize_line_comment(Buffer('%hello world'))
    '%hello world'
    >>> tokenize_line_comment(Buffer('%hello\n world'))
    '%hello'
    """
    comment = TokenWithPosition('', text.position)
    at_comment = text.peek() == '%' and text.peek(-1) != '\\'
    if not at_comment:
        return None
    comment += text.forward(1)
    while text.peek() != '\n' and text.hasNext():
        comment += text.forward(1)
    return comment
def read_tex(src):
    r"""Read next expression from buffer

    :param Buffer src: a buffer of tokens
    :return: parsed expression (TexEnv, TexCmd, or a plain token)
    """
    c = next(src)
    if c.startswith('%'):
        # Line comments are preserved verbatim.
        return c
    if c.startswith('$'):
        # Math delimiter token: read up to the matching delimiter.
        name = '$$' if c.startswith('$$') else '$'
        expr = TexEnv(name, [], nobegin=True)
        return read_math_env(src, expr)
    if c.startswith('\\'):
        command = TokenWithPosition(c[1:], src.position)
        if command == 'item':
            extra, arg = read_item(src)
            mode, expr = 'command', TexCmd(command, arg, extra)
        elif command == 'begin':
            # src.peek(1) is the environment name; forward(3) presumably
            # consumes the surrounding brace group -- verify.
            mode, expr, _ = 'begin', TexEnv(src.peek(1)), src.forward(3)
        else:
            mode, expr = 'command', TexCmd(command)
        # TODO: should really be handled by tokenizer
        # Tentatively skip whitespace; rewound below if no args follow.
        candidate_index = src.num_forward_until(lambda s: not s.isspace())
        src.forward(candidate_index)
        line_breaks = 0
        # NOTE: 'and' binds tighter than 'or', so this reads as:
        # peek in ARG_START_TOKENS, OR (peek == '\n' AND no break yet).
        # At most one consecutive newline is tolerated between args.
        while (src.peek() in ARG_START_TOKENS or
                (src.peek() == '\n') and line_breaks == 0):
            if src.peek() == '\n':  # Advance buffer if first newline
                line_breaks += 1
                next(src)
            else:
                line_breaks = 0
                expr.args.append(read_tex(src))
        if not expr.args:
            # NOTE(review): newlines consumed by next(src) above are not
            # accounted for by this rewind -- verify against the buffer
            # semantics.
            src.backward(candidate_index)
        if mode == 'begin':
            # Read the environment body up to the matching \end.
            read_env(src, expr)
        return expr
    if c in ARG_START_TOKENS:
        return read_arg(src, c)
    return c
def read_tex(src):
    r"""Read next expression from buffer

    :param Buffer src: a buffer of tokens
    :return: parsed expression (TexEnv, TexCmd, or a plain token)
    """
    c = next(src)
    if c.startswith('%'):
        # Line comments are preserved verbatim.
        return c
    elif c.startswith('$'):
        # Dollar math: '$$' must be tested before its prefix '$'.
        name = '$$' if c.startswith('$$') else '$'
        expr = TexEnv(name, [], nobegin=True)
        return read_math_env(src, expr)
    elif c.startswith('\[') or c.startswith("\("):
        # Bracket math: \[...\] is display math, \(...\) is inline math.
        if c.startswith('\['):
            name = 'displaymath'
            begin = '\['
            end = '\]'
        else:
            name = "math"
            begin = "\("
            end = "\)"
        expr = TexEnv(name, [], nobegin=True, begin=begin, end=end)
        return read_math_env(src, expr)
    elif c.startswith('\\'):
        command = TokenWithPosition(c[1:], src.position)
        if command == 'item':
            contents, arg = read_item(src)
            mode, expr = 'command', TexCmd(command, contents, arg)
        elif command == 'begin':
            # src.peek(1) is the environment name; forward(3) presumably
            # consumes the surrounding brace group -- verify.
            mode, expr, _ = 'begin', TexEnv(src.peek(1)), src.forward(3)
        else:
            mode, expr = 'command', TexCmd(command)
        # Argument parsing is delegated to the shared read_args helper.
        expr.args = read_args(src, expr.args)
        if mode == 'begin':
            # Read the environment body up to the matching \end.
            read_env(src, expr)
        return expr
    if c in ARG_START_TOKENS:
        return read_arg(src, c)
    return c
def read_item(src):
    r"""Read the item content.

    There can be any number of whitespace characters between \item and
    the first non-whitespace character. However, after that first
    non-whitespace character, the item can only tolerate one successive
    line break at a time.

    \item can also take an argument.

    :param Buffer src: a buffer of tokens
    :return: tuple of (contents of the item, item arguments)
    """
    def stringify(s):
        """Rejoin a token's space-split pieces into a single token."""
        return TokenWithPosition.join(s.split(' '), glue=' ')

    def criterion(s):
        """Catch the first non-whitespace character"""
        # Generator (not a materialized list) inside any().
        return not any(s.startswith(substr) for substr in string.whitespace)

    # Optional item argument, such as in a description environment:
    # \item[label] content
    arg = []
    if src.peek() in ARG_START_TOKENS:
        c = next(src)
        arg.append(read_arg(src, c))

    # Skip whitespace up to the first real content; drop a single
    # leading space left over from tokenization.
    last = stringify(src.forward_until(criterion))
    if last.startswith(' '):
        last = last[1:]
    extra = [last]

    # Consume expressions until the item terminates: a blank line, the
    # next \item, the enclosing \end, or (after the first expression) a
    # trailing double line break on the last-read token.
    while (src.hasNext() and not src.startswith('\n\n') and
            not src.startswith('\item') and
            not src.startswith('\end') and
            not (hasattr(last, 'endswith') and last.endswith('\n\n') and
                 len(extra) > 1)):
        last = read_tex(src)
        extra.append(last)
    return extra, arg
def stringify(s):
    """Rejoin the space-split pieces of ``s`` into one token, glued by
    single spaces."""
    pieces = s.split(' ')
    return TokenWithPosition.join(pieces, glue=' ')
def read_tex(src):
    r"""Read next expression from buffer

    :param Buffer src: a buffer of tokens
    :return: parsed expression (TexEnv, TexCmd, or a plain token)
    """
    c = next(src)
    if c.startswith('%'):
        # Line comments are preserved verbatim.
        return c
    elif c.startswith('$'):
        # Dollar math: '$$' must be tested before its prefix '$'.
        name = '$$' if c.startswith('$$') else '$'
        expr = TexEnv(name, [], nobegin=True)
        return read_math_env(src, expr)
    elif c.startswith('\[') or c.startswith("\("):
        # Bracket math: \[...\] is display math, \(...\) is inline math.
        if c.startswith('\['):
            name = 'displaymath'
            begin = '\['
            end = '\]'
        else:
            name = "math"
            begin = "\("
            end = "\)"
        expr = TexEnv(name, [], nobegin=True, begin=begin, end=end)
        return read_math_env(src, expr)
    elif c.startswith('\\'):
        command = TokenWithPosition(c[1:], src.position)
        if command == 'item':
            extra, arg, stuff = read_item(src)
            mode, expr = 'command', TexCmd(command, arg, extra, stuff)
        elif command == 'begin':
            # src.peek(1) is the environment name; forward(3) presumably
            # consumes the surrounding brace group -- verify.
            mode, expr, _ = 'begin', TexEnv(src.peek(1)), src.forward(3)
        else:
            mode, expr = 'command', TexCmd(command)
        # TODO: should really be handled by tokenizer
        # Tentatively consume whitespace into expr.stuff so formatting
        # is preserved; this is undone below if no arguments follow.
        stuff_index, candidate_index = 0, src.num_forward_until(
            lambda s: not s.isspace())
        while src.peek().isspace():
            stuff_index += 1
            expr.stuff.append(read_tex(src))
        line_breaks = 0
        # NOTE: 'and' binds tighter than 'or': accept an argument token,
        # or whitespace when no line break has been counted yet.
        while (src.peek() in ARG_START_TOKENS or
                src.peek().isspace() and line_breaks == 0):
            space_index = src.num_forward_until(lambda s: not s.isspace())
            if space_index > 0:
                line_breaks += 1
                # Keep the whitespace only if it contains at most one
                # newline and an argument token follows it.
                if (src.peek((0, space_index)).count("\n") <= 1 and
                        src.peek(space_index) in ARG_START_TOKENS):
                    expr.stuff.append(read_tex(src))
            else:
                line_breaks = 0
                # A real argument: record it both as an argument and in
                # the raw 'stuff' stream.
                tex_text = read_tex(src)
                expr.args.append(tex_text)
                expr.stuff.append(tex_text)
        if not expr.args:
            # No arguments found: drop the tentatively-consumed
            # whitespace and rewind the buffer.
            if stuff_index > 0:
                del expr.stuff[-stuff_index:]
            src.backward(candidate_index)
        if mode == 'begin':
            # Read the environment body up to the matching \end.
            read_env(src, expr)
        return expr
    if c in ARG_START_TOKENS:
        return read_arg(src, c)
    return c