def test_basic_2b(self):
    """Encode a French sentence with no protection around the replacements."""
    encoder = UnicodeToLatexEncoder(replacement_latex_protection='none')

    text = "\"\N{LATIN CAPITAL LETTER A WITH GRAVE} votre sant\N{LATIN SMALL LETTER E WITH ACUTE}!\" s'exclama le ma\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}tre de maison \N{LATIN SMALL LETTER A WITH GRAVE} 100%."
    expected = "''\\`A votre sant\\'e!'' s'exclama le ma\\^\\itre de maison \\`a 100\\%."

    self.assertEqual(encoder.unicode_to_latex(text), expected)
def test_rules_03(self):
    """Encode a sentence using only the built-in 'unicode-xml' rule set."""
    encoder = UnicodeToLatexEncoder(conversion_rules=['unicode-xml'])

    text = "* \"\N{LATIN CAPITAL LETTER A WITH GRAVE} votre sant\N{LATIN SMALL LETTER E WITH ACUTE}!\" s'exclama\N{SUPERSCRIPT TWO} le ma\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}tre de maison \N{LATIN SMALL LETTER A WITH GRAVE} 100%."
    expected = "{\\ast} \"\\`{A} votre sant\\'{e}!\" s{\\textquotesingle}exclama{^2} le ma\\^{\\i}tre de maison \\`{a} 100\\%."

    self.assertEqual(encoder.unicode_to_latex(text), expected)
def test_basic_2d(self):
    """With non_ascii_only=False the ASCII special characters are escaped too."""
    ascii_chars_convert = " \" # $ % & \\ _ { } ~ "
    expected = " '' \\# \\$ \\% \\& {\\textbackslash} \\_ \\{ \\} {\\textasciitilde} "

    encoder = UnicodeToLatexEncoder(non_ascii_only=False)
    encoded = encoder.unicode_to_latex(ascii_chars_convert)

    self.assertEqual(encoded, expected)
def test_rules_01(self):
    """Mix dict, regex, built-in and callable conversion rules in one encoder."""

    def acallable(s, pos):
        # A callable rule returns (consumed_length, replacement), or None
        # when it does not apply at this position.
        if s[pos] == "\N{LATIN SMALL LETTER E WITH ACUTE}":
            return (1, r"{\'{e}}")
        if s.startswith('...', pos):
            return (3, r"\ldots")
        return None

    dict_rule = latexencode.UnicodeToLatexConversionRule(
        latexencode.RULE_DICT,
        {
            ord("\N{LATIN CAPITAL LETTER A WITH GRAVE}"): r"{{\`{A}}}",
            ord("%"): r"\textpercent",
        },
    )
    regex_rule = latexencode.UnicodeToLatexConversionRule(
        latexencode.RULE_REGEX,
        [
            (re.compile('v(otre)'), r'n\1'),
            (re.compile("s'exclama", flags=re.I), r"s'exprima"),
            (re.compile('\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}'), r"{\^i}"),
        ],
    )
    callable_rule = latexencode.UnicodeToLatexConversionRule(
        latexencode.RULE_CALLABLE, acallable
    )

    encoder = UnicodeToLatexEncoder(conversion_rules=[
        dict_rule,
        regex_rule,
        'unicode-xml',  # a built-in rule name, expanded by the encoder
        callable_rule,
    ])

    text = "\"\N{LATIN CAPITAL LETTER A WITH GRAVE} votre sant\N{LATIN SMALL LETTER E WITH ACUTE}!\" s'exclama le ma\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}tre de maison ... \N{LATIN SMALL LETTER A WITH GRAVE} 100%."
    expected = "\"{{\\`{A}}} notre sant\\'{e}!\" s'exprima le ma{\\^i}tre de maison {\\ldots} \\`{a} 100{\\textpercent}."

    self.assertEqual(encoder.unicode_to_latex(text), expected)
def escape_special_chars(self, suppress_warnings: bool = True) -> None:
    """Escape special characters in the entry's bibliographic data.

    String values of `self.data` are replaced in place by their LaTeX
    encoding, produced by `pylatexenc`'s `UnicodeToLatexEncoder`. The `file`
    and `url` fields are left untouched, and non-string values are skipped.

    Args:
        suppress_warnings: if True, warnings about unknown characters raised
            by `pylatexenc` are suppressed. This is overridden (warnings are
            emitted) when `LOGGER` is enabled for `logging.DEBUG`.
    """
    warn_on_unknown = not suppress_warnings or LOGGER.isEnabledFor(logging.DEBUG)
    enc = UnicodeToLatexEncoder(
        non_ascii_only=True,
        replacement_latex_protection="braces-all",
        unknown_char_policy="keep",
        unknown_char_warning=warn_on_unknown,
    )

    for field, raw in self.data.items():
        if field in ("file", "url"):
            # do NOT encode these fields; keep any special characters as they are
            self.data[field] = raw
        elif isinstance(raw, str):
            self.data[field] = enc.unicode_to_latex(raw)
def test_basic_1(self):
    """Non-ASCII characters are brace-wrapped; ASCII specials stay as they are."""
    encoder = UnicodeToLatexEncoder(
        non_ascii_only=True,
        replacement_latex_protection='braces-all',
    )

    text = "\"\N{LATIN CAPITAL LETTER A WITH GRAVE} votre sant\N{LATIN SMALL LETTER E WITH ACUTE}!\" s'exclama le ma\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}tre de maison \N{LATIN SMALL LETTER A WITH GRAVE} 100%."
    expected = "\"{\\`A} votre sant{\\'e}!\" s'exclama le ma{\\^\\i}tre de maison {\\`a} 100%."

    self.assertEqual(encoder.unicode_to_latex(text), expected)
def test_basic_3b(self):
    """The 'replace' policy substitutes a placeholder and logs a warning."""
    test_unknown_chars = "A unicode character: \N{THAI CHARACTER THO THONG}"
    expected = "A unicode character: {\\bfseries ?}"

    # A warning is expected here; assertLogs fails the test if none is logged.
    with self.assertLogs(logger='pylatexenc.latexencode', level='WARNING') as captured:
        encoder = UnicodeToLatexEncoder(unknown_char_policy='replace')
        self.assertEqual(encoder.unicode_to_latex(test_unknown_chars), expected)
def test_basic_callable_replacement_latex_protection(self):
    """A callable passed as replacement_latex_protection wraps each replacement."""

    def wrap_in_stars(s):
        return '{***{' + s + '}***}'

    encoder = UnicodeToLatexEncoder(replacement_latex_protection=wrap_in_stars)

    text = "\"\N{LATIN CAPITAL LETTER A WITH GRAVE} votre sant\N{LATIN SMALL LETTER E WITH ACUTE}!\" s'exclama le ma\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}tre de maison \N{LATIN SMALL LETTER A WITH GRAVE} 100%."
    expected = "{***{''}***}{***{\\`A}***} votre sant{***{\\'e}***}!{***{''}***} s'exclama le ma{***{\\^\\i}***}tre de maison {***{\\`a}***} 100{***{\\%}***}."

    self.assertEqual(encoder.unicode_to_latex(text), expected)
def test_rules_02(self):
    """Naming the 'defaults' rule set explicitly (based on test_basic_0())."""
    encoder = UnicodeToLatexEncoder(conversion_rules=['defaults'])

    text = "* \"\N{LATIN CAPITAL LETTER A WITH GRAVE} votre sant\N{LATIN SMALL LETTER E WITH ACUTE}!\" s'exclama\N{SUPERSCRIPT TWO} le ma\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}tre de maison \N{LATIN SMALL LETTER A WITH GRAVE} 100%."
    expected = "* ''\\`A votre sant\\'e!'' s'exclama{\\texttwosuperior} le ma{\\^\\i}tre de maison \\`a 100\\%."

    self.assertEqual(encoder.unicode_to_latex(text), expected)
def test_basic_3c(self):
    """The 'unihex' policy renders an unknown character as its U+XXXX code."""
    test_unknown_chars = "A unicode character: \N{THAI CHARACTER THO THONG}"
    expected = "A unicode character: \\ensuremath{\\langle}\\texttt{U+0E18}\\ensuremath{\\rangle}"

    encoder = UnicodeToLatexEncoder(
        unknown_char_policy='unihex',
        unknown_char_warning=False,
    )

    self.assertEqual(encoder.unicode_to_latex(test_unknown_chars), expected)
def test_basic_2a(self):
    """Check 'braces-after-macro' protection (Issue #44)."""
    encoder = UnicodeToLatexEncoder(
        replacement_latex_protection='braces-after-macro',
    )

    text = "Jabłoński, François, ⟨.⟩, ~"
    expected = "Jab\\l{}o\\'nski, Fran\\c{c}ois, \\ensuremath{\\langle}.\\ensuremath{\\rangle}, \\textasciitilde{}"

    self.assertEqual(encoder.unicode_to_latex(text), expected)
def __init__(self):
    """Create the encoder `self.u`: one custom regex rule, then the defaults."""
    # Custom rule for U+1EC5. The backslashes in the replacement are doubled
    # on purpose; see UnicodeToLatexConversionRule.
    custom_rule = UnicodeToLatexConversionRule(
        RULE_REGEX,
        [
            (re.compile(r'\u1ec5'), r'\\~{\\^{{e}}}'),
        ],
    )

    self.u = UnicodeToLatexEncoder(
        # custom rule first, then all the default rules
        conversion_rules=[custom_rule, 'defaults'],
        replacement_latex_protection='braces-almost-all',
    )
def test_latex_string_class(self):
    """A custom latex_string_class collects the output chunk by chunk."""

    class LatexChunkList:
        # Minimal accumulator: only `+=` is implemented.
        def __init__(self):
            self.chunks = []

        def __iadd__(self, s):
            self.chunks.append(s)
            return self

    encoder = UnicodeToLatexEncoder(
        latex_string_class=LatexChunkList,
        replacement_latex_protection='none',
    )

    # The returned object is a LatexChunkList instance, not a str.
    result = encoder.unicode_to_latex("A é → α")

    expected_chunks = [
        'A',
        ' ',
        r'\'e',
        ' ',
        r'\textrightarrow',
        ' ',
        r'\ensuremath{\alpha}',
    ]
    self.assertEqual(result.chunks, expected_chunks)
def latex_encoder():
    """Return the `unicode_to_latex` method of a freshly built encoder.

    The encoder applies the regex rules from `extra_rules()` first and then
    the built-in 'defaults' rule set.
    """
    extra_regex_rule = UnicodeToLatexConversionRule(
        rule_type=RULE_REGEX,
        rule=extra_rules(),
    )
    encoder = UnicodeToLatexEncoder(
        conversion_rules=[extra_regex_rule, 'defaults'],
    )
    return encoder.unicode_to_latex
def test_issue_no21(self):
    """Regression test for https://github.com/phfaist/pylatexenc/issues/21.

    Acronyms are wrapped in braces and existing braces are preserved, first
    through a callable rule and then through equivalent regex rules.
    """
    text = "Title with {Some} ABC acronyms LIKe this."
    expected = "Title with {Some} {ABC} acronyms {LIKe} this."

    acronym_rx = re.compile(r'\b[A-Z]{2,}\w*\b')

    def capitalize_acronyms(s, pos):
        if s[pos] in ('{', '}'):
            # preserve existing braces
            return (1, s[pos])
        found = acronym_rx.match(s, pos)
        if found is None:
            return None
        return (found.end() - found.start(), "{" + found.group() + "}")

    # Variant 1: callable rule placed ahead of the default rules.
    callable_rules = [
        latexencode.UnicodeToLatexConversionRule(
            latexencode.RULE_CALLABLE, capitalize_acronyms
        ),
    ]
    encoder = UnicodeToLatexEncoder(
        conversion_rules=callable_rules
        + latexencode.get_builtin_conversion_rules('defaults')
    )
    self.assertEqual(encoder.unicode_to_latex(text), expected)

    # Variant 2: the same behavior expressed as regex rules.
    regex_rules = [
        latexencode.UnicodeToLatexConversionRule(
            latexencode.RULE_REGEX,
            [
                (re.compile(r'([{}])'), r'\1'),  # keep existing braces
                (re.compile(r'\b([A-Z]{2,}\w*)\b'), r'{\1}'),
            ],
        ),
    ]
    encoder = UnicodeToLatexEncoder(
        conversion_rules=regex_rules
        + latexencode.get_builtin_conversion_rules('defaults')
    )
    self.assertEqual(encoder.unicode_to_latex(text), expected)
def escape_special_chars(self, suppress_warnings=True):
    """Escape special characters in the string values of `self.data`.

    Each string value is replaced in place by its LaTeX encoding, produced by
    pylatexenc's `UnicodeToLatexEncoder`. The `ID` and `file` fields are left
    untouched, and non-string values are skipped.

    Args:
        suppress_warnings (bool): if True, warnings about unknown characters
            are suppressed, unless `LOGGER` is enabled for level 10 (DEBUG).
    """
    # 10 = DEBUG logging level
    warn_on_unknown = not suppress_warnings or LOGGER.isEnabledFor(10)
    enc = UnicodeToLatexEncoder(
        non_ascii_only=True,
        replacement_latex_protection='braces-all',
        unknown_char_policy='keep',
        unknown_char_warning=warn_on_unknown,
    )

    for field, raw in self.data.items():
        if field in ('ID', 'file'):
            # do NOT encode these fields; keep any special characters as they are
            self.data[field] = raw
        elif isinstance(raw, str):
            self.data[field] = enc.unicode_to_latex(raw)
class BibtexWriter(Writer):
    """Writer that LaTeX-encodes text and abbreviates long person lists."""

    # Bound `unicode_to_latex` of one encoder instance shared by the class.
    latex_encode = UnicodeToLatexEncoder(
        replacement_latex_protection="braces-after-macro",
        non_ascii_only=True,
    ).unicode_to_latex

    def _encode(self, text):
        """Return `text` passed through the shared LaTeX encoder."""
        return self.latex_encode(text)

    def _write_persons(self, stream, persons, role):
        """Write the persons for `role`; more than 10 become 'first and others'."""
        if len(persons) <= 10:
            super(BibtexWriter, self)._write_persons(stream, persons, role)
            return

        first_person = self._format_name(stream, persons[0])
        self._write_field(stream, role, first_person + " and others")
def latex_encode(text, contains_math=False):
    """Encode a string for use in a LaTeX format.

    Returns None when `text` is None.

    Args:
        contains_math (bool): when True, math environments delimited by
            $...$ or \\(...\\) are preserved to avoid double escaping. Note
            that $$...$$ is not handled.
    """
    if text is None:
        return None

    encode = UnicodeToLatexEncoder(
        replacement_latex_protection="braces-after-macro").unicode_to_latex

    # Without the math flag, or without any math delimiter, encode everything.
    if not contains_math or ("$" not in text and r"\(" not in text):
        return encode(text)

    # The split alternates plain text (even indices) and math (odd indices);
    # only the plain-text pieces are encoded.
    pieces = []
    for index, part in enumerate(MATH_EXPRESSION_REGEX.split(text)):
        if index % 2 == 0:
            pieces.append(encode(part))
        else:
            pieces.append(part)
    return "".join(pieces)
import os
import time
import locale
import json
import argparse
from tika import parser
from pylatexenc.latexencode import UnicodeToLatexConversionRule, UnicodeToLatexEncoder, RULE_REGEX

# Module-level encoder: a regex rule with an empty rule list (a slot for
# custom replacements), followed by pylatexenc's built-in 'defaults' rules.
encoder = UnicodeToLatexEncoder(conversion_rules=[
    UnicodeToLatexConversionRule(RULE_REGEX, []),
    'defaults'
])

# Path of config.json inside the current working directory.
config_path = os.path.join(os.getcwd(), "config.json")

# Empty configuration skeleton.
# NOTE(review): presumably the default content written to config.json -- confirm.
config_template = {
    "title": {},
    "authors": [],
    "packages": [],
    "commands": {},
    "environments": {}
}

# LaTeX project template; the literal continues past this point.
# NOTE(review): the doubled braces suggest it is filled with str.format -- confirm.
tex_project_template = \
""" \\documentclass[12pt]{{article}}
# type hints from typing import ( Optional, Dict, List, Any, Union, TypeVar, Generic, Tuple, Set ) encoder: UnicodeToLatexEncoder = \ UnicodeToLatexEncoder(unknown_char_policy='replace', replacement_latex_protection="braces", non_ascii_only=True) @dataclass class XMLItem(ABC): """ Base XML wrapper class. This item consists on a dataclass with basically two fields: + `tag`, containing the XML tag identifier. + `item_tag`, containing the XML tag itself. This abstract class defines two abstract methods that must be override: - :func:`parse` - :func:`to_table`
def test_basic_2c(self):
    """With non_ascii_only=True, ASCII special characters pass through unchanged."""
    ascii_chars_convert = " \" # $ % & \\ _ { } ~ "

    encoder = UnicodeToLatexEncoder(non_ascii_only=True)
    encoded = encoder.unicode_to_latex(ascii_chars_convert)

    self.assertEqual(encoded, ascii_chars_convert)
def test_all(self):
    """Encode every named Unicode character and diff against a reference file.

    One line per named code point is encoded and written to
    `_tmp_uni_chars_test.temp.txt` next to this file; the result is compared
    with `uni_chars_test_previous.txt`.

    Raises:
        self.failureException: if the generated file differs from the
            reference file (the unified diff is printed first).
    """
    root_logger = logging.getLogger()
    loglevel = root_logger.level
    # Silence the root logger while encoding the whole Unicode range.
    root_logger.setLevel(logging.CRITICAL)

    try:
        u = UnicodeToLatexEncoder(
            unknown_char_policy='fail',
            replacement_latex_protection='braces-almost-all')

        def fn(x, bdir=os.path.realpath(os.path.abspath(
                os.path.dirname(__file__)))):
            # Resolve a file name relative to this test module's directory.
            return os.path.join(bdir, x)

        with codecs.open(fn('_tmp_uni_chars_test.temp.txt'), 'w',
                         encoding='utf-8') as testf:
            for i in range(0x10FFFF):
                # iter over all valid unicode characters
                try:
                    # test if valid, i.e., it has a UNICODE NAME
                    chrname = unicodedata.name(unichr(i))
                except ValueError:
                    continue

                line = "0x%04X %-50s |%s|\n" % (i, '[' + chrname + ']', unichr(i))

                # try to encode it using our unicode_to_latex routines;
                # characters that cannot be encoded are skipped
                try:
                    enc = u.unicode_to_latex(line)
                except ValueError:
                    continue
                testf.write(enc)

        with codecs.open(fn('uni_chars_test_previous.txt'), 'r',
                         encoding='utf-8') as reff, \
             codecs.open(fn('_tmp_uni_chars_test.temp.txt'), 'r',
                         encoding='utf-8') as testf:
            a = reff.readlines()
            b = testf.readlines()
    finally:
        # Always restore the level, even if writing or reading a file fails;
        # otherwise the root logger stays at CRITICAL for the rest of the run.
        root_logger.setLevel(loglevel)

    logger = logging.getLogger(__name__)

    # only check up to the supported unicode range
    if sys.maxunicode < 0x10FFFF:
        logger.warning(
            "Only checking up to unicode U+%X, your python build doesn't support higher",
            sys.maxunicode)
        a = [
            aline for aline in a
            if int(aline[:aline.find(' ')], 0) < sys.maxunicode
        ]

    s = difflib.unified_diff(a, b,
                             fromfile='uni_chars_test_previous.txt',
                             tofile='_tmp_uni_chars_test.temp.txt')
    diffmsg = "".join(s).strip()
    if diffmsg:
        print(diffmsg)
        raise self.failureException(
            "Unicode coverage tests failed. See full diff above.")