def _ucp_width(ucs, control_chars='guess'):
    '''Return the :term:`textual width` of a single unicode code point.

    :arg ucs: integer value of one unicode :term:`code point`
    :kwarg control_chars: how control characters are handled.  Possible
        values are:

        :guess: (default) make a best guess at the width of a control
            code.  Backspace, delete, and clear delete give -1.  Escape
            currently gives -1 as well, but that is not guaranteed because
            it is not always correct.  All other control codes give 0.
        :strict: raise :exc:`~kitchen.text.exceptions.ControlCharError`
            as soon as a control code is encountered

        Only the value ``'strict'`` is tested for; anything else is
        handled the same way as ``'guess'``.
    :raises ControlCharError: if the :term:`code point` is a unicode
        control character and :attr:`control_chars` is set to 'strict'
    :returns: :term:`textual width` of the character: -1 or 0 for control
        characters, 0 for combining characters, 2 for code points inside
        the wide ranges tested below and 1 for everything else.

    .. note::

        This is the :term:`textual width`, not the number of characters
        or bytes.
    '''
    # 8-bit control characters: everything below 0x20 plus 0x7f-0x9f
    is_control = ucs < 0x20 or 0x7f <= ucs < 0xa0
    if is_control:
        if control_chars == 'strict':
            raise ControlCharError(
                _('_ucp_width does not understand how to'
                  ' assign a width value to control characters.'))
        # Backspace (0x08), delete (0x7f) and clear delete (0x94) remove a
        # single character.
        # Escape (0x1b) is tricky: it removes some number of the characters
        # that follow it, but how many depends on whatever interprets the
        # code.  -1 will often be wrong, but so would any other value.
        if ucs in (0x08, 0x1b, 0x7f, 0x94):
            return -1
        # Every other control character takes up no width
        return 0

    if _interval_bisearch(ucs, _COMBINING):
        # Combining characters are merged into the width of the character
        # they combine with, so they add nothing themselves
        return 0

    # From here on ucs is neither a combining nor a C0/C1 control character.
    # Nothing below 0x1100 is wide.
    if ucs < 0x1100:
        return 1

    if (ucs <= 0x115f                                   # Hangul Jamo init. consonants
            or ucs in (0x2329, 0x232a)
            or (0x2e80 <= ucs <= 0xa4cf and ucs != 0x303f)  # CJK ... Yi
            or 0xac00 <= ucs <= 0xd7a3                  # Hangul Syllables
            or 0xf900 <= ucs <= 0xfaff                  # CJK Compatibility Ideographs
            or 0xfe10 <= ucs <= 0xfe19                  # Vertical forms
            or 0xfe30 <= ucs <= 0xfe6f                  # CJK Compatibility Forms
            or 0xff00 <= ucs <= 0xff60                  # Fullwidth Forms
            or 0xffe0 <= ucs <= 0xffe6
            or 0x20000 <= ucs <= 0x2fffd
            or 0x30000 <= ucs <= 0x3fffd):
        return 2
    return 1
def process_control_chars(string, strategy='replace'):
    '''Find :term:`control characters` in a string and transform them

    :arg string: string to search for :term:`control characters` and
        transform them within
    :kwarg strategy: XML does not allow :term:`ASCII` :term:`control
        characters`, so the caller chooses what happens when one is found.
        Valid options are:

        :replace: (default) Replace the :term:`control characters`
            with ``"?"``
        :ignore: Remove the characters altogether from the output
        :strict: Raise a :exc:`~kitchen.text.exceptions.ControlCharError`
            when we encounter a control character
    :raises TypeError: if :attr:`string` is not a unicode string.
    :raises ValueError: if the strategy is not one of replace, ignore, or
        strict.
    :raises kitchen.text.exceptions.ControlCharError: if the strategy is
        ``strict`` and a :term:`control character` is present in the
        :attr:`string`
    :returns: :class:`str` string with no :term:`control characters` in it.

    .. versionchanged:: kitchen 1.2.0, API: kitchen.text 2.2.0
        Strip out the C1 control characters in addition to the C0 control
        characters.
    '''
    # Validate both arguments before looking at the string contents
    if not isunicodestring(string):
        raise TypeError('process_control_char must have a unicode type'
                        ' (str) as the first argument.')
    if strategy not in ('replace', 'ignore', 'strict'):
        raise ValueError('The strategy argument to process_control_chars'
                         ' must be one of ignore, replace, or strict')

    # Most strings don't have control chars, and checking for them is
    # cheaper than translating, so hand clean strings straight back.
    if _CONTROL_CHARS.isdisjoint(string):
        return string

    # At least one control character is present
    if strategy == 'strict':
        raise ControlCharError('ASCII control code present in string'
                               ' input')

    # Only 'replace' and 'ignore' can reach this point
    if strategy == 'replace':
        translation = _REPLACE_TABLE
    else:
        translation = _IGNORE_TABLE
    return string.translate(translation)
def process_control_chars(string, strategy='replace'):
    '''Find :term:`control characters` in a string and transform them

    :arg string: string to search for :term:`control characters` and
        transform them within
    :kwarg strategy: XML does not allow :term:`ASCII` :term:`control
        characters`, so the caller chooses what happens when one is found.
        Valid options are:

        :replace: (default) Replace the :term:`control characters`
            with ``"?"``
        :ignore: Remove the characters altogether from the output
        :strict: Raise a :exc:`~kitchen.text.exceptions.ControlCharError`
            when we encounter a control character
    :raises TypeError: if :attr:`string` is not a unicode string.
    :raises ValueError: if the strategy is not one of replace, ignore, or
        strict.
    :raises kitchen.text.exceptions.ControlCharError: if the strategy is
        ``strict`` and a :term:`control character` is present in the
        :attr:`string`
    :returns: :class:`unicode` string with no :term:`control characters`
        in it.
    '''
    if not isinstance(string, unicode):
        raise TypeError(
            k.b_('process_control_char must have a unicode type as'
                 ' the first argument.'))

    if strategy == 'strict':
        # Nothing is rewritten in strict mode: either the string is clean
        # and returned untouched, or we raise.
        chars_present = frozenset(string)
        for control_char in _CONTROL_CHARS:
            if control_char in chars_present:
                raise ControlCharError(
                    k.b_('ASCII control code present in string'
                         ' input'))
        return string

    # Pick what every control code gets mapped to.  Mapping to None makes
    # translate() drop the character.
    if strategy == 'ignore':
        substitute = None
    elif strategy == 'replace':
        substitute = u'?'
    else:
        raise ValueError(
            k.b_('The strategy argument to process_control_chars'
                 ' must be one of ignore, replace, or strict'))

    control_table = dict.fromkeys(_CONTROL_CODES, substitute)
    # An empty table would change nothing, so skip the translate call
    if control_table:
        string = string.translate(control_table)
    return string
def process_control_chars(string, strategy='replace'):
    '''Find control characters in a string and transform them

    :arg string: string to search for control characters and transform
        them in
    :kwarg strategy: XML does not allow ASCII control characters, so the
        caller chooses what happens when one is found.  Valid options are:

        :replace: (default) Replace the control characters with "?"
        :ignore: Remove the characters altogether from the output
        :strict: Raise an error when we encounter a control character
    :raises TypeError: if :attr:`string` is not a unicode string.
    :raises ValueError: if the strategy is not one of replace, ignore, or
        strict.
    :raises ControlCharError: if the strategy is strict and a control
        character is present in :attr:`string`
    :returns: unicode string with no control characters in it.
    '''
    if not isinstance(string, unicode):
        raise TypeError(
            _('process_control_char must have a unicode type as'
              ' the first argument.'))

    if strategy == 'strict':
        # Strict mode never rewrites the string: it is returned as-is
        # unless a control character is found, in which case we raise.
        seen = frozenset(string)
        for candidate in _control_chars:
            if candidate in seen:
                raise ControlCharError(
                    _('ASCII control code present in string'
                      ' input'))
        return string

    # Decide what each control code is translated into.  A value of None
    # makes translate() delete the character.
    if strategy == 'ignore':
        fill = None
    elif strategy == 'replace':
        fill = u'?'
    else:
        raise ValueError(
            _('The strategy argument to process_control_chars'
              ' must be one of ignore, replace, or strict'))

    control_table = dict.fromkeys(_control_codes, fill)
    # Nothing to translate when the table is empty
    if control_table:
        string = string.translate(control_table)
    return string