def test_unescape(self):
    """Check ``unescape`` and ``u_unescape`` against known inputs."""
    from clld.lib.bibtex import unescape, u_unescape

    # Byte-string input: a LaTeX escape followed by a latin1-encoded character.
    latin1_bytes = binary_type("\\ss \xef".encode('latin1'))
    self.assertEqual(unescape(latin1_bytes), 'ß\xef')

    # Plain LaTeX escape with a trailing space.
    self.assertEqual(unescape("\\ss "), 'ß')

    # ``?[\uNNN]`` placeholders: the expected values correspond to NNN read
    # as a decimal code point (123 -> '{', 1234 -> U+04D2).
    placeholders = '?[\\u123] ?[\\u1234]'
    self.assertEqual(u_unescape(placeholders), '{ \u04d2')

    # Non-ASCII text without any escapes must come back unchanged.
    s = '\u2013'
    self.assertEqual(s, unescape(s))

    # 65533 is the decimal value of U+FFFD.
    self.assertEqual(unescape('?[\\u65533]'), '\ufffd')
def unescape(string):
    r"""Transform LaTeX escape sequences (e.g. ``\ss`` or ``\ae``) into unicode.

    The input is coerced to ``unicode``, stripped of surrounding whitespace and
    passed through ``stripctrlchars``.  The cleaned text is then decoded with
    the ``latex+latin1`` codec, run through ``u_unescape``, and every key of
    ``SYMBOLS`` is replaced by its mapped value (shortest keys first).

    If the cleaned text cannot be converted with ``str()`` the LaTeX decoding
    step is skipped and the cleaned text is used as is.

    :param string: the text to unescape (anything ``unicode()`` accepts).
    :return: the unescaped unicode string.
    """
    def _delatex(text):
        """Decode LaTeX escapes in *text*, falling back to *text* itself."""
        try:
            # The codec works on byte strings; str() raises
            # UnicodeEncodeError when the text cannot be encoded implicitly.
            result = str(text).decode('latex+latin1')
        except UnicodeEncodeError:  # pragma: no cover
            # Fall back to the already cleaned text, not the raw outer
            # argument, so stripping and control-character removal are
            # not silently discarded on this path.
            result = text
        return unicode(result)

    cleaned = stripctrlchars(unicode(string).strip())
    res = u_unescape(_delatex(cleaned))

    # Replace shorter symbols first; sorted() is stable, so symbols of equal
    # length keep the iteration order of SYMBOLS.
    for symbol in sorted(SYMBOLS, key=len):
        res = res.replace(symbol, SYMBOLS[symbol])

    # Without any remaining backslash the braces are pure LaTeX grouping
    # leftovers and can be dropped.
    if '\\' not in res:
        res = res.replace('{', '')
        res = res.replace('}', '')

    res = res.replace('\\\\&{} ', '& ')
    return res