Example #1
    def register(self,
                 unicode_text,
                 latex_text,
                 mode='text',
                 package=None,
                 decode=True,
                 encode=True):
        """Register a correspondence between *unicode_text* and *latex_text*.

        :param str unicode_text: A unicode character.
        :param bytes latex_text: Its corresponding LaTeX translation.
        :param str mode: LaTeX mode in which the translation applies
            (``'text'`` or ``'math'``).
        :param str package: LaTeX package requirements (currently ignored).
        :param bool decode: Whether this translation applies to decoding
            (default: ``True``).
        :param bool encode: Whether this translation applies to encoding
            (default: ``True``).
        """
        if package is not None:
            # TODO implement packages
            pass
        if mode == 'math':
            # also register text version
            self.register(unicode_text,
                          b'$' + latex_text + b'$',
                          mode='text',
                          package=package,
                          decode=decode,
                          encode=encode)
            # XXX for the time being, we do not perform in-math substitutions
            return
        # tokenize, and register unicode translation
        self.lexer.reset()
        self.lexer.state = 'M'
        tokens = tuple(self.lexer.get_tokens(latex_text, final=True))
        if decode:
            if tokens not in self.unicode_map:
                self.max_length = max(self.max_length, len(tokens))
                self.unicode_map[tokens] = unicode_text
            # also register token variant with brackets, if appropriate
            # for instance, "\'{e}" for "\'e", "\c{c}" for "\c c", etc.
            # note: we do not remove brackets (they sometimes matter,
            # e.g. bibtex uses them to prevent lower case transformation)
            if (len(tokens) == 2 and tokens[0].name.startswith('control')
                    and tokens[1].name == 'chars'):
                alt_tokens = (
                    tokens[0],
                    lexer.Token('chars', b'{'),
                    tokens[1],
                    lexer.Token('chars', b'}'),
                )
                if alt_tokens not in self.unicode_map:
                    self.max_length = max(self.max_length, len(alt_tokens))
                    self.unicode_map[alt_tokens] = u"{" + unicode_text + u"}"
        if encode and unicode_text not in self.latex_map:
            assert len(unicode_text) == 1
            self.latex_map[unicode_text] = (latex_text, tokens)
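
A minimal usage sketch of register(), for orientation. The LatexUnicodeTable name and its constructor argument follow latexcodec's conventions but are assumptions here, and the registrations are illustrative rather than the library's actual table:

# Hypothetical sketch: populating a translation table via register().
from latexcodec import lexer

table = LatexUnicodeTable(lexer.LatexIncrementalDecoder())  # assumed constructor
table.register(u'\N{LATIN SMALL LETTER E WITH ACUTE}', b"\\'e")
table.register(u'\N{EM DASH}', b'\\textemdash')
# A math-mode registration also registers a text-mode variant wrapped
# in $...$ (see the mode == 'math' branch above).
table.register(u'\N{GREEK SMALL LETTER ALPHA}', b'\\alpha', mode='math')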
Example #2
    def _get_latex_bytes_tokens_from_char(self, c):
        """Return the LaTeX bytes and token tuple for a single character *c*."""
        # if ascii, try latex equivalents
        # (this covers \, #, &, and other special LaTeX characters)
        if ord(c) < 128:
            try:
                return self.table.latex_map[c]
            except KeyError:
                pass
        # next, try input encoding
        try:
            bytes_ = c.encode(self.inputenc, 'strict')
        except UnicodeEncodeError:
            pass
        else:
            if self.binary_mode:
                return bytes_, (lexer.Token(name='chars', text=bytes_),)
            else:
                return c, (lexer.Token(name='chars', text=c),)
        # next, try latex equivalents of common unicode characters
        try:
            return self.table.latex_map[c]
        except KeyError:
            # translation failed
            if self.errors == 'strict':
                raise UnicodeEncodeError(
                    "latex",  # codec
                    c,  # problematic input
                    0, 1,  # location of problematic character
                    "don't know how to translate {0} into latex"
                    .format(repr(c)))
            elif self.errors == 'ignore':
                return self.emptychar, (self.emptytoken,)
            elif self.errors == 'replace':
                # use the \char command
                # this assumes
                # \usepackage[T1]{fontenc}
                # \usepackage[utf8]{inputenc}
                if self.binary_mode:
                    bytes_ = b'{\\char' + str(ord(c)).encode("ascii") + b'}'
                else:
                    bytes_ = u'{\\char' + str(ord(c)) + u'}'
                return bytes_, (lexer.Token(name='chars', text=bytes_),)
            elif self.errors == 'keep' and not self.binary_mode:
                return c, (lexer.Token(name='chars', text=c),)
            else:
                raise ValueError(
                    "latex codec does not support {0} errors"
                    .format(self.errors))
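
The errors branching above is easiest to observe through the public codec interface. A hedged sketch, assuming that importing latexcodec registers the 'latex' codec and that U+2603 (SNOWMAN) has no entry in the translation table; the exact bytes are illustrative:

import latexcodec  # noqa: F401 -- importing registers the 'latex' codec

u'\u2603'.encode('latex', 'replace')  # e.g. b'{\\char9731}' via the \char fallback
u'\u2603'.encode('latex', 'ignore')   # empty output; the character is dropped
u'\u2603'.encode('latex', 'strict')   # raises UnicodeEncodeError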
Example #3
class LatexIncrementalEncoder(lexer.LatexIncrementalEncoder):
    """Translating incremental encoder for latex. Maintains a state to
    determine whether control spaces etc. need to be inserted.
    """

    emptytoken = lexer.Token(u"unknown", u"")
    """The empty token."""

    table = _LATEX_UNICODE_TABLE
    """Translation table."""
    def __init__(self, errors='strict'):
        super(LatexIncrementalEncoder, self).__init__(errors=errors)
        self.reset()

    def reset(self):
        super(LatexIncrementalEncoder, self).reset()
        self.state = 'M'

    def get_space_bytes(self, bytes_):
        """Insert a space or control space when in space-eating mode."""
        if self.state == 'S':
            # in space eating mode
            # control space needed?
            if bytes_.startswith(u' '):
                # replace by control space
                return u'\\ ', bytes_[1:]
            else:
                # insert space (it is eaten, but needed for separation)
                return u' ', bytes_
        else:
            return u'', bytes_

    def _get_latex_chars_tokens_from_char(self, c):
        # if ascii, try latex equivalents
        # (this covers \, #, &, and other special LaTeX characters)
        if ord(c) < 128:
            try:
                return self.table.latex_map[c]
            except KeyError:
                pass
        # next, try input encoding
        try:
            bytes_ = c.encode(self.inputenc, 'strict')
        except UnicodeEncodeError:
            pass
        else:
            return c, (lexer.Token(name=u'chars', text=c),)
        # next, try latex equivalents of common unicode characters
        try:
            return self.table.latex_map[c]
        except KeyError:
            # translation failed
            if self.errors == 'strict':
                raise UnicodeEncodeError(
                    "latex",  # codec
                    c,  # problematic input
                    0, 1,  # location of problematic character
                    "don't know how to translate {0} into latex"
                    .format(repr(c)))
            elif self.errors == 'ignore':
                return u'', (self.emptytoken,)
            elif self.errors == 'replace':
                # use the \char command
                # this assumes
                # \usepackage[T1]{fontenc}
                # \usepackage[utf8]{inputenc}
                bytes_ = u'{\\char' + str(ord(c)) + u'}'
                return bytes_, (lexer.Token(name=u'chars', text=bytes_),)
            elif self.errors == 'keep' and not self.binary_mode:
                return c, (lexer.Token(name=u'chars', text=c),)
            else:
                raise ValueError(
                    "latex codec does not support {0} errors".format(
                        self.errors))

    def get_latex_chars(self, unicode_, final=False):
        if not isinstance(unicode_, string_types):
            raise TypeError(
                "expected unicode for encode input, but got {0} instead"
                .format(unicode_.__class__.__name__))
        # convert character by character
        for pos, c in enumerate(unicode_):
            bytes_, tokens = self._get_latex_chars_tokens_from_char(c)
            space, bytes_ = self.get_space_bytes(bytes_)
            # update state
            if tokens[-1].name == u'control_word':
                # we're eating spaces
                self.state = 'S'
            else:
                self.state = 'M'
            if space:
                yield space
            yield bytes_
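
To see the space-eating state ('S') in action, a sketch using Python's standard incremental-encoder interface. This assumes latexcodec exposes the codec through the codecs registry; the exact outputs are illustrative:

import codecs
import latexcodec  # noqa: F401 -- importing registers the 'latex' codec

encoder = codecs.getincrementalencoder('latex')()
parts = [
    encoder.encode(u'\N{LATIN SMALL LETTER SHARP S}'),  # e.g. b'\\ss'; ends in a control word, so state -> 'S'
    encoder.encode(u' und', final=True),  # leading space becomes a control space, e.g. b'\\ und'
]
# Joined, e.g. b'\\ss\\ und': the control space keeps LaTeX from eating
# the separator after the control word.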