def printLatexNodes(self, text):
    """Echo ``text``, parse it with ``LatexWalker`` and print the parsed nodes.

    Prints the input prefixed with ``"print "``, then the length reported by
    ``get_latex_nodes`` prefixed with ``"len_ "``, and finally passes the
    parsed node list to ``self._printLatexNodes``.
    """
    print("print " + text)
    walker = LatexWalker(text)
    parsed_nodes, _start, parsed_len = walker.get_latex_nodes(pos=0)
    print("len_ " + str(parsed_len))
    self._printLatexNodes(parsed_nodes)
def test_input(self):
    """Check how ``LatexNodes2Text`` resolves the input macro against its input directory.

    Covers a file name with and without the ``.tex`` extension, a path that
    climbs out of the configured directory (expected to be dropped by
    default), and the same path with ``strict_input=False``.
    """

    def render(converter, source):
        # Parse ``source`` from scratch and convert its top-level node list.
        return converter.nodelist_to_text(LatexWalker(source).get_latex_nodes()[0])

    testdir = os.path.realpath(os.path.abspath(os.path.dirname(__file__)))
    correct_text = r'''ABCDEF fdksanfkld safnkd anfklsa hi there! This is an equation: x + y i = 0 where i is the imaginary unit. MORENKFDNSN'''

    l2t = LatexNodes2Text()
    l2t.set_tex_input_directory(testdir)

    # File name given with its extension.
    latex = r'''ABCDEF fdksanfkld safnkd anfklsa \input{test_input_1.tex} MORENKFDNSN'''
    output = render(l2t, latex)
    self.assertEqualUpToWhitespace(output, correct_text)

    # File name given without extension.
    latex = r'''ABCDEF fdksanfkld safnkd anfklsa \input{test_input_1} MORENKFDNSN'''
    self.assertEqualUpToWhitespace(render(l2t, latex), correct_text)

    # Path pointing outside of the configured input directory.
    latex = r'''ABCDEF fdksanfkld safnkd anfklsa \input{../test_input_1} MORENKFDNSN'''
    correct_text_unsafe = correct_text  # as before
    correct_text_safe = r'''ABCDEF fdksanfkld safnkd anfklsa MORENKFDNSN'''
    dummy_dir = os.path.join(testdir, 'dummy')

    # make sure that the \input{} directive failed to include the file.
    l2t = LatexNodes2Text()
    l2t.set_tex_input_directory(dummy_dir)
    self.assertEqualUpToWhitespace(render(l2t, latex), correct_text_safe)

    # but without the strict_input flag, it can access it.
    l2t.set_tex_input_directory(dummy_dir, strict_input=False)
    self.assertEqualUpToWhitespace(render(l2t, latex), correct_text_unsafe)
def test_accents(self):
    """Accent macros must come out as the matching precomposed characters."""
    cases = (
        (
            r"Fran\c cais",
            '''Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais''',
        ),
        (
            r"Fr\'en{\'{e}}tique",
            '''Fr\N{LATIN SMALL LETTER E WITH ACUTE}n\N{LATIN SMALL LETTER E WITH ACUTE}tique''',
        ),
    )
    for source, expected in cases:
        nodes = LatexWalker(source).get_latex_nodes()[0]
        self.assertEqual(LatexNodes2Text().nodelist_to_text(nodes), expected)
def test_accents(self):
    """Accent macros (and a negated relation in math mode) map to the expected characters."""

    def parse(source):
        # Top-level node list of ``source``.
        return LatexWalker(source).get_latex_nodes()[0]

    cedilla = LatexNodes2Text().nodelist_to_text(parse(r"Fran\c cais"))
    self.assertEqual(cedilla, '''Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais''')

    acute = LatexNodes2Text().nodelist_to_text(parse(r"Fr\'en{\'{e}}tique"))
    self.assertEqual(
        acute,
        '''Fr\N{LATIN SMALL LETTER E WITH ACUTE}n\N{LATIN SMALL LETTER E WITH ACUTE}tique''',
    )

    # The expected "not equal" sign is the NFC form of "=" plus the combining overlay.
    negated = LatexNodes2Text(math_mode='with-delimiters').nodelist_to_text(
        parse(r"$1 \not= 2$"))
    not_equal = unicodedata.normalize('NFC', "=\N{COMBINING LONG SOLIDUS OVERLAY}")
    self.assertEqual(negated, '''$1 {} 2$'''.format(not_equal))
def test_keep_braced_groups(self):
    """``keep_braced_groups`` keeps braces in the output, subject to ``keep_braced_groups_minlen``."""

    def convert(source, **options):
        # Render ``source`` with a converter configured through ``options``.
        nodes = LatexWalker(source).get_latex_nodes()[0]
        return LatexNodes2Text(**options).nodelist_to_text(nodes)

    self.assertEqual(
        convert(
            r"\textit{Voil\`a du texte}. Il est \'{e}crit {en fran{\c{c}}ais}",
            keep_braced_groups=True,
        ),
        '''Voil\N{LATIN SMALL LETTER A WITH GRAVE} du texte. Il est \N{LATIN SMALL LETTER E WITH ACUTE}crit {en fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais}''',
    )

    self.assertEqual(
        convert(r"A{XYZ}{ABCD}", keep_braced_groups=True, keep_braced_groups_minlen=4),
        '''AXYZ{ABCD}''',
    )

    self.assertEqual(
        convert(r"{A}{XYZ}{ABCD}", keep_braced_groups=True, keep_braced_groups_minlen=0),
        '''{A}{XYZ}{ABCD}''',
    )
def test_get_latex_environment(self):
    """Check ``LatexWalker.get_latex_environment`` on an ``enumerate`` block.

    The environment is read twice, once with the expected name given and once
    without, and both calls must return the same ``(node, pos, len)`` tuple.
    Requesting a wrong environment name must raise ``LatexWalkerParseError``
    because the walker is created with ``tolerant_parsing=False``.
    """
    latextext = r'''Text and \`accent and \textbf{bold text} and $\vec b$ more stuff for Fran\c cois \begin{enumerate}[(i)] \item Hi there! % here goes a comment \item[a] Hello! @@@ \end{enumerate} Indeed thanks to \cite[Lemma 3]{Author}, we know that... Also: {\itshape some italic text}. '''
    lw = LatexWalker(latextext, tolerant_parsing=False)
    p = latextext.find(r'\begin{enumerate}')

    def expected_result():
        # Built afresh for every assertion so the two checks share no node objects.
        return (
            LatexEnvironmentNode('enumerate', [
                LatexCharsNode('\n'),
                LatexMacroNode('item', None, [], macro_post_space=' '),
                LatexCharsNode('Hi there! '),
                LatexCommentNode(' here goes a comment'),
                LatexMacroNode('item', LatexGroupNode([LatexCharsNode('a')]), []),
                LatexCharsNode(' Hello! @@@\n ')
            ], [LatexGroupNode([LatexCharsNode('(i)')])], []),
            p,
            latextext.find(r'\end{enumerate}') + len(r'\end{enumerate}') - p,
        )

    self.assertEqual(
        lw.get_latex_environment(pos=p, environmentname='enumerate'),
        expected_result())
    self.assertEqual(lw.get_latex_environment(pos=p), expected_result())

    # Only the exception matters here, so the return value is not bound.
    with self.assertRaises(LatexWalkerParseError):
        lw.get_latex_environment(pos=p, environmentname='XYZNFKLD-WRONG')
def test_get_latex_maybe_optional_arg(self):
    """``get_latex_maybe_optional_arg`` returns ``None`` without a bracket group, else the group tuple."""
    latextext = r'''Text and \`accent and \textbf{bold text} and $\vec b$ more stuff for Fran\c cois \begin{enumerate}[(i)] \item Hi there! % here goes a comment \item[a] Hello! @@@ \end{enumerate} Indeed thanks to \cite[Lemma 3]{Author}, we know that... '''
    walker = LatexWalker(latextext, tolerant_parsing=False)

    # Position just past the macro name; a brace group follows, not a bracket.
    textbf = r'\textbf'
    p = latextext.find(textbf) + len(textbf)
    self.assertEqual(walker.get_latex_maybe_optional_arg(pos=p), None)

    # Position just past the macro name; the bracketed "[Lemma 3]" follows.
    cite = r'\cite'
    p = latextext.find(cite) + len(cite)
    found = walker.get_latex_maybe_optional_arg(pos=p)
    self.assertEqual(found, (
        LatexGroupNode([LatexCharsNode('Lemma 3')]),
        p,
        9,
    ))
def test_get_latex_braced_group(self):
    """Check ``LatexWalker.get_latex_braced_group`` for ``{...}`` and ``[...]`` groups.

    The brace group is requested both from the space preceding it and from the
    opening brace itself; both calls must report the group as starting on the
    brace. The bracket group is requested with ``brace_type='['``.
    """
    latextext = r'''Text and \`accent and \textbf{bold text} and $\vec b$ more stuff for Fran\c cois \begin{enumerate}[(i)] \item Hi there! % here goes a comment \item[a] Hello! @@@ \end{enumerate} Indeed thanks to \cite[Lemma 3]{Author}, we know that... Also: {\itshape some italic text}. '''
    lw = LatexWalker(latextext, tolerant_parsing=False)

    # Raw string: "\i" is not a valid escape sequence in a normal literal.
    group_source = r'{\itshape some italic text}'

    def expected_group(start):
        # Expected (node, pos, len) tuple for the italic group starting at ``start``.
        return (
            LatexGroupNode([
                LatexMacroNode('itshape', None, [], macro_post_space=' '),
                LatexCharsNode('some italic text')
            ]),
            start,
            len(group_source),
        )

    p = latextext.find(r'Also: {') + len('Also:')  # points on space after 'Also:'
    self.assertEqual(
        lw.get_latex_braced_group(pos=p, brace_type='{'),
        expected_group(p + 1))
    self.assertEqual(
        lw.get_latex_braced_group(pos=p + 1, brace_type='{'),
        expected_group(p + 1))

    p = latextext.find(r'[(i)]')
    self.assertEqual(lw.get_latex_braced_group(pos=p, brace_type='['), (
        LatexGroupNode([LatexCharsNode('(i)')]),
        p,
        5,
    ))
def test_basic(self):
    """Basic LaTeX-to-text conversion, with and without ``keep_inline_math``."""

    def nodes_of(source):
        # Top-level node list of ``source``.
        return LatexWalker(source).get_latex_nodes()[0]

    bold = LatexNodes2Text().nodelist_to_text(nodes_of(r'\textbf{A}'))
    self.assertEqual(bold, 'A')

    latex = r'''\textit{hi there!} This is {\em an equation}: \begin{equation} x + y i = 0 \end{equation} where $i$ is the imaginary unit. '''

    plain = LatexNodes2Text().nodelist_to_text(nodes_of(latex))
    self.assertEqualUpToWhitespace(
        plain,
        r'''hi there! This is an equation: x + y i = 0 where i is the imaginary unit. ''')

    with_math = LatexNodes2Text(keep_inline_math=True).nodelist_to_text(nodes_of(latex))
    self.assertEqualUpToWhitespace(
        with_math,
        r'''hi there! This is an equation: x + y i = 0 where $i$ is the imaginary unit. ''')

    # The convenience entry point must agree with the explicit two-step path.
    two_step = LatexNodes2Text().nodelist_to_text(nodes_of(latex))
    self.assertEqual(two_step, LatexNodes2Text().latex_to_text(latex))
def test_basic(self):
    """Basic LaTeX-to-text conversion, including quotes and ``math_mode='with-delimiters'``."""

    def nodes_of(source):
        # Top-level node list of ``source``.
        return LatexWalker(source).get_latex_nodes()[0]

    bold = LatexNodes2Text().nodelist_to_text(nodes_of(r'\textbf{A}'))
    self.assertEqual(bold, 'A')

    latex = r'''\textit{hi there!} This is {\em an equation}: \begin{equation} x + y i = 0 \end{equation} where $i$ is the ``imaginary unit.'' '''

    plain = LatexNodes2Text().nodelist_to_text(nodes_of(latex))
    self.assertEqualUpToWhitespace(
        plain,
        u'''hi there! This is an equation: x + y i = 0 where i is the “imaginary unit.” ''')

    delimited = LatexNodes2Text(math_mode='with-delimiters').nodelist_to_text(
        nodes_of(latex))
    self.assertEqualUpToWhitespace(
        delimited,
        u'''hi there! This is an equation: \\begin{equation} x + y i = 0 \\end{equation} where $i$ is the “imaginary unit.” ''')

    # The convenience entry point must agree with the explicit two-step path.
    two_step = LatexNodes2Text().nodelist_to_text(nodes_of(latex))
    self.assertEqual(two_step, LatexNodes2Text().latex_to_text(latex))
def test_get_latex_nodes(self):
    """Check ``LatexWalker.get_latex_nodes`` from several start positions.

    Three scenarios are covered: reading to the end of the text, reading the
    inside of a brace group with ``stop_upon_closing_brace='}'``, and parsing
    with a caller-supplied ``macro_dict`` entry for ``cite``.
    """
    latextext = r'''Text and \`accent and \textbf{bold text} and $\vec b$ more stuff for Fran\c cois \begin{enumerate}[(i)] \item Hi there! % here goes a comment \item[a] Hello! @@@ \end{enumerate} Indeed thanks to \cite[Lemma 3]{Author}, we know that... Also: {\itshape some italic text}. '''
    lw = LatexWalker(latextext, tolerant_parsing=False)

    p = latextext.find('Also: {')
    self.assertEqual(lw.get_latex_nodes(pos=p), ([
        LatexCharsNode('Also: '),
        LatexGroupNode([
            LatexMacroNode('itshape', None, [], macro_post_space=' '),
            LatexCharsNode('some italic text')
        ]),
        LatexCharsNode('.')
    ], p, len(latextext) - p - 1))  # trailing '\n' is not included

    p = latextext.find('Also: {') + len('Also: {')  # points inside right after open brace
    self.assertEqual(
        lw.get_latex_nodes(pos=p, stop_upon_closing_brace='}'),
        ([
            LatexMacroNode('itshape', None, [], macro_post_space=' '),
            LatexCharsNode('some italic text')
        ],
         p,
         # Raw string: "\i" is not a valid escape sequence in a normal literal.
         len(r'\itshape some italic text}')))

    # test our own macro lists etc.
    pindeed = latextext.find('Indeed thanks to')
    lineindeed = latextext[pindeed:latextext.find('\n', pindeed)]
    lw2 = LatexWalker(lineindeed, tolerant_parsing=False,
                      macro_dict={'cite': MacrosDef('cite', False, 4)})
    self.assertEqual(lw2.get_latex_nodes(pos=0), ([
        LatexCharsNode('Indeed thanks to '),
        LatexMacroNode('cite', None, [
            LatexCharsNode('['),
            LatexCharsNode('L'),
            LatexCharsNode('e'),
            LatexCharsNode('m'),
        ]),
        LatexCharsNode('ma 3]'),
        LatexGroupNode([LatexCharsNode('Author')]),
        LatexCharsNode(', we know that...'),
    ], 0, len(lineindeed)))
def test_errors(self):
    """Inconsistent LaTeX must raise in strict mode and parse in tolerant mode.

    With ``tolerant_parsing=False`` the walker is expected to raise
    ``LatexWalkerParseError``; with ``tolerant_parsing=True`` the same input
    must go through, otherwise the test fails with the error text attached.
    """
    latextext = get_test_latex_data_with_possible_inconsistencies()

    lw = LatexWalker(latextext, tolerant_parsing=False)
    # Only the exception matters here, so the return value is not bound.
    with self.assertRaises(LatexWalkerParseError):
        lw.get_latex_nodes()

    lwOk = LatexWalker(latextext, tolerant_parsing=True)
    # make sure that it goes through without raising:
    try:
        lwOk.get_latex_nodes()
    except LatexWalkerParseError as e:
        # should not raise this.
        # ``unicode`` does not exist on Python 3; ``format`` renders the
        # exception as text on both Python 2 and Python 3.
        self.fail(
            u"get_latex_nodes() raised LatexWalkerParseError, but it shouldn't have in "
            u"tolerant parsing mode!\n" + u"{}".format(e))
# test_get_token: calls LatexWalker.get_token at hand-picked offsets of a sample
# LaTeX string and compares each result with an explicit LatexToken. Cases, in
# order: plain characters; macros with and without surrounding space; \begin
# read as a plain macro (environments=False) and as 'begin_environment'; the
# matching 'end_environment'; a comment; '{' and '}'; '[' and ']' with
# brackets_are_chars=False; and '$' both on the default walker ('char') and on
# a walker built with keep_inline_math=True ('mathmode_inline').
# NOTE(review): this definition is collapsed onto two physical lines and the
# sample string's line breaks appear to have been flattened to spaces (e.g. the
# end_environment case expects pos=p + 6 while pre_space is shown as '\n ').
# Confirm against the original formatting before relying on the offsets.
def test_get_token(self): latextext = r'''Text \`accent and \textbf{bold text} and $\vec b$ vector \& also Fran\c cois \begin{enumerate}[(i)] \item Hi there! % here goes a comment \item[a] Hello! @@@ \end{enumerate} ''' lw = LatexWalker(latextext) self.assertEqual( lw.get_token(pos=0), LatexToken(tok='char', arg='T', pos=0, len=1, pre_space='')) self.assertEqual( lw.get_token(pos=1), LatexToken(tok='char', arg='e', pos=1, len=1, pre_space='')) p = latextext.find(r'\`') self.assertEqual( lw.get_token(pos=p), LatexToken(tok='macro', arg='`', pos=p, len=2, pre_space='')) p = latextext.find(r'\textbf') - 1 # pre space self.assertEqual( lw.get_token(pos=p), LatexToken(tok='macro', arg='textbf', pos=p + 1, len=7, pre_space=' ')) p = latextext.find(r'\vec') # post-space self.assertEqual( lw.get_token(pos=p), LatexToken(tok='macro', arg='vec', pos=p, len=5, pre_space='', post_space=' ')) p = latextext.find(r'\&') - 1 # pre-space and *no* post-space self.assertEqual( lw.get_token(pos=p), LatexToken(tok='macro', arg='&', pos=p + 1, len=2, pre_space=' ', post_space='')) p = latextext.find(r'\begin') self.assertEqual( lw.get_token(pos=p, environments=False), LatexToken(tok='macro', arg='begin', pos=p, len=6, pre_space='')) p = latextext.find(r'\begin') self.assertEqual( lw.get_token(pos=p), LatexToken(tok='begin_environment', arg='enumerate', pos=p, len=len(r'\begin{enumerate}'), pre_space='')) p = latextext.find(r'@@@') + 3 # pre space to \end self.assertEqual( lw.get_token(pos=p), LatexToken(tok='end_environment', arg='enumerate', pos=p + 6, len=len(r'\end{enumerate}'), pre_space='\n ')) p = latextext.find(r'%') - 1 self.assertEqual( lw.get_token(pos=p), LatexToken(tok='comment', arg=' here goes a comment', pos=p + 1, len=len('% here goes a comment\n'), pre_space=' ', post_space='\n')) p = latextext.find(r'{') self.assertEqual( lw.get_token(pos=p), LatexToken(tok='brace_open', arg='{', pos=p, len=1, pre_space='')) p = latextext.find(r'}') self.assertEqual( 
lw.get_token(pos=p), LatexToken(tok='brace_close', arg='}', pos=p, len=1, pre_space='')) p = latextext.find(r'[') self.assertEqual( lw.get_token(pos=p, brackets_are_chars=False), LatexToken(tok='brace_open', arg='[', pos=p, len=1, pre_space='')) p = latextext.find(r']') self.assertEqual( lw.get_token(pos=p, brackets_are_chars=False), LatexToken(tok='brace_close', arg=']', pos=p, len=1, pre_space='')) p = latextext.find(r'$') self.assertEqual( lw.get_token(pos=p), LatexToken(tok='char', arg='$', pos=p, len=1, pre_space='')) lw2 = LatexWalker(latextext, keep_inline_math=True) p = latextext.find(r'$') self.assertEqual( lw2.get_token(pos=p), LatexToken(tok='mathmode_inline', arg='$', pos=p, len=1, pre_space=''))
# Script fragment. It compiles `re_single_quote`, a pattern matching a backtick
# or an apostrophe, and defines `is_symbol`, which reports whether
# `node.macroname` is exactly one character long. When run as a script it reads
# the file named by `input_tex`, parses the text with LatexWalker, opens
# `output_txt` for writing and starts iterating over the top-level nodes.
# NOTE(review): `is_symbol`'s second parameter is named `LatexMacroNode` and is
# never used; presumably a type annotation (`node: LatexMacroNode`) was
# intended -- confirm against callers before changing the signature.
# NOTE(review): `input_tex` and `output_txt` are not defined in this fragment,
# and the body of the final `for` loop is not visible here.
re_single_quote = re.compile(r"`|'") def is_symbol(node, LatexMacroNode): # TODO: improve this function return len(node.macroname) == 1 if __name__ == "__main__": # Read latex file with open(input_tex, "r") as fp: text = fp.read() # Parse latex walker = LatexWalker(text) # Parse nodes nodes, pos, length = walker.get_latex_nodes() # Write output to a file with open(output_txt, "w") as fp: # Number of titles num_titles = 0 # If previous node is a citation no_new_paragraph = False # Loop over all nodes for node in nodes: # TODO: Implement more rules and process environments recursively
def test2(self):
    """Parse a display-math snippet and print each child of the first top-level node."""
    w = LatexWalker(r"""\[\int_{a}^{b} x^2 \,dx \]""")
    # Only the node list is needed; position and length are ignored.
    nodelist, _pos, _len = w.get_latex_nodes(pos=0)
    # Iterate over the children directly instead of indexing through range(len(...)).
    for child in nodelist[0].nodelist:
        print(child)
def traverseLatex(self, text):
    """Parse ``text`` with ``LatexWalker`` and pass its top-level nodes to ``self._traverseLatex``."""
    parsed = LatexWalker(text).get_latex_nodes(pos=0)
    # ``get_latex_nodes`` yields a three-item result; only the node list is used.
    top_level_nodes, _start, _length = parsed
    self._traverseLatex(top_level_nodes)
def findMathNode(self):
    """Parse ``self.equation`` and return what ``self._findMathNodeInList`` finds in its top-level nodes."""
    parsed = LatexWalker(self.equation).get_latex_nodes(pos=0)
    # ``get_latex_nodes`` yields a three-item result; only the node list is used.
    top_level_nodes, _start, _length = parsed
    return self._findMathNodeInList(top_level_nodes)
def test_get_latex_expression(self):
    """Check ``LatexWalker.get_latex_expression`` at several positions.

    Covers a single character, a macro, a brace group, a position on a comment
    (the expression after the comment is expected), and a position directly on
    a closing brace. For the closing brace, a tolerant walker and a strict
    walker called with ``strict_braces=False`` return an empty chars node,
    while a strict walker with ``strict_braces=True`` raises
    ``LatexWalkerParseError``.
    """
    latextext = r'''Text and \`accent and \textbf{bold text} and $\vec b$ more stuff for Fran\c cois \begin{enumerate}[(i)] \item Hi there! % here goes a comment \item[a] Hello! @@@ \end{enumerate} '''
    lw = LatexWalker(latextext, tolerant_parsing=True)

    self.assertEqual(lw.get_latex_expression(pos=0), (
        LatexCharsNode('T'),
        0,
        1,
    ))

    p = latextext.find(r'\`')
    self.assertEqual(lw.get_latex_expression(pos=p), (
        LatexMacroNode('`', None, []),
        p,
        2,
    ))

    p = latextext.find(r'{')
    self.assertEqual(lw.get_latex_expression(pos=p), (
        LatexGroupNode([LatexCharsNode('bold text')]),
        p,
        11,
    ))

    # check: correctly skips comments (the original repeated this identical
    # assertion twice; one copy is kept)
    p = latextext.find(r'%')
    self.assertEqual(lw.get_latex_expression(pos=p), (
        LatexMacroNode('item', None, []),
        p + len('% here goes a comment\n'),
        5,
    ))

    # check correct behavior if directly on brace close
    p = latextext.find(r'}')
    self.assertEqual(lw.get_latex_expression(pos=p, strict_braces=True), (
        LatexCharsNode(''),
        p,
        0,
    ))

    lw2 = LatexWalker(latextext, tolerant_parsing=False)
    self.assertEqual(lw2.get_latex_expression(pos=p, strict_braces=False), (
        LatexCharsNode(''),
        p,
        0,
    ))

    # Only the exception matters here, so the return value is not bound.
    with self.assertRaises(LatexWalkerParseError):
        lw2.get_latex_expression(pos=p, strict_braces=True)
from pylatexenc.latexwalker import LatexWalker, LatexEnvironmentNode
import pylatexenc
import sys

# Script: print the file name given on the command line, then print the text
# chunks found directly inside its top-level LaTeX environments on one line.

fname = sys.argv[1]
with open(fname, "r") as f:
    tex = f.read()
print(fname)

walker = LatexWalker(tex)
nodelist, pos, leen = walker.get_latex_nodes()

# Keep only character nodes that are direct children of a top-level
# environment node, stripped of surrounding whitespace.
charslist = [
    str(subnode.chars).strip()
    for node in nodelist
    if node.isNodeType(pylatexenc.latexwalker.LatexEnvironmentNode)
    for subnode in node.nodelist
    if subnode.isNodeType(pylatexenc.latexwalker.LatexCharsNode)
]

# Join the chunks with spaces and flatten any remaining line breaks.
fullst = " ".join(charslist).replace("\n", " ")
print(fullst)