def test_guess_encoding_no_chardet(self):
    # guess_encoding() only accepts byte strings; unicode input must raise
    tools.assert_raises(TypeError, misc.guess_encoding, self.u_spanish)

    def guess_without_chardet(raw):
        # Every remaining check runs with chardet detection switched off
        return misc.guess_encoding(raw, disable_chardet=True)

    tools.ok_(guess_without_chardet(self.utf8_spanish) == 'utf-8')
    tools.ok_(guess_without_chardet(self.latin1_spanish) == 'latin-1')
    tools.ok_(guess_without_chardet(self.utf8_japanese) == 'utf-8')
    # With chardet disabled the euc_jp sample is expected to be reported
    # as latin-1
    tools.ok_(guess_without_chardet(self.euc_jp_japanese) == 'latin-1')
def test_guess_encoding_with_chardet(self):
    # Several encodings can produce the same byte sequence, so the guessed
    # encoding's name is not compared directly.  What matters is that
    # decoding with the guess gives back the original unicode string
    # without knowing the converter beforehand.
    def decode_with_guess(raw):
        return to_unicode(raw, misc.guess_encoding(raw))

    tools.ok_(decode_with_guess(self.utf8_spanish) == self.u_spanish)
    tools.ok_(decode_with_guess(self.latin1_spanish) == self.u_spanish)
    tools.ok_(decode_with_guess(self.utf8_japanese) == self.u_japanese)
def test_guess_encoding_with_chardet(self):
    # Different encodings may emit identical bytes, so each check is a
    # round trip: decode the bytes with the guessed encoding and compare
    # the result with the unicode string they were made from.

    # utf-8 encoded Spanish
    spanish_utf8_guess = misc.guess_encoding(self.utf8_spanish)
    spanish_from_utf8 = to_unicode(self.utf8_spanish, spanish_utf8_guess)
    tools.ok_(spanish_from_utf8 == self.u_spanish)

    # latin-1 encoded Spanish
    spanish_latin1_guess = misc.guess_encoding(self.latin1_spanish)
    spanish_from_latin1 = to_unicode(self.latin1_spanish,
                                     spanish_latin1_guess)
    tools.ok_(spanish_from_latin1 == self.u_spanish)

    # utf-8 encoded Japanese
    japanese_utf8_guess = misc.guess_encoding(self.utf8_japanese)
    japanese_from_utf8 = to_unicode(self.utf8_japanese, japanese_utf8_guess)
    tools.ok_(japanese_from_utf8 == self.u_japanese)
def test_guess_encoding_no_chardet(self):
    guess = misc.guess_encoding
    # Keyword arguments that switch chardet detection off for a call
    no_chardet = {'disable_chardet': True}

    # Unicode strings are rejected; only byte strings may be guessed
    tools.assert_raises(TypeError, guess, self.u_spanish)

    # Spanish samples
    tools.ok_(guess(self.utf8_spanish, **no_chardet) == 'utf-8')
    tools.ok_(guess(self.latin1_spanish, **no_chardet) == 'latin-1')

    # Japanese samples; the euc_jp one is expected to come back as latin-1
    # when chardet is disabled
    tools.ok_(guess(self.utf8_japanese, **no_chardet) == 'utf-8')
    tools.ok_(guess(self.euc_jp_japanese, **no_chardet) == 'latin-1')
def test_guess_encoding_with_chardet_uninstalled(self):
    # This scenario only applies when the chardet module is unavailable
    if chardet:
        raise SkipTest('chardet installed, euc_jp will not be mangled')

    # Decoding the euc_jp bytes with the guessed encoding is expected to
    # give the mangled "euc_jp read as latin1" text
    guessed = misc.guess_encoding(self.euc_jp_japanese)
    decoded = to_unicode(self.euc_jp_japanese, guessed)
    tools.ok_(decoded == self.u_mangled_euc_jp_as_latin1)
def guess_encoding_to_xml(string, output_encoding='utf-8', attrib=False,
                          control_chars='replace'):
    '''Escape text of unknown encoding into a byte :class:`bytes` for xml

    :arg string: :class:`str` or byte :class:`bytes` to be transformed
        into a byte :class:`bytes` suitable for inclusion in xml.  A
        :class:`str` is escaped directly.  For a byte :class:`bytes` we
        first attempt to guess the encoding; if we cannot guess, we
        fallback to ``latin-1``.
    :kwarg output_encoding: Encoding of the returned byte :class:`bytes`.
        This should match the encoding of your xml file.
    :kwarg attrib: If :data:`True`, escape the item for use in an xml
        attribute.  If :data:`False` (default) escape the item for use in
        a text node.
    :kwarg control_chars: Passed through unchanged to
        :func:`unicode_to_xml` or :func:`byte_string_to_xml`; see those
        functions for the accepted values.  Defaults to ``replace``.
    :returns: byte :class:`bytes` encoded in ``output_encoding``
        (:term:`utf-8` by default)
    '''
    # Escaping options that both conversion helpers receive unchanged
    escape_opts = {'attrib': attrib, 'control_chars': control_chars}

    if not isunicodestring(string):
        # Byte strings: guess the encoding they arrived in, then let
        # byte_string_to_xml() transcode them with errors='replace'
        guessed = guess_encoding(string)
        return byte_string_to_xml(string, input_encoding=guessed,
                                  errors='replace',
                                  output_encoding=output_encoding,
                                  **escape_opts)

    # Unicode strings can go straight through unicode_to_xml()
    return unicode_to_xml(string, encoding=output_encoding, **escape_opts)
def guess_encoding_to_xml(string, output_encoding='utf8', attrib=False,
                          control_chars='replace'):
    '''Turn a string of unknown encoding into a byte string for xml

    :arg string: unicode or byte string to be transformed into a byte
        string suitable for inclusion in xml.  If string is a byte string
        we attempt to guess the encoding.  If we cannot guess, we fallback
        to latin1.
    :kwarg output_encoding: Output encoding for the byte string.  This
        should match the encoding of your xml file.
    :kwarg attrib: If True, escape the item for use in an attribute.  If
        False (default) escape the item for use in a text node.
    :kwarg control_chars: Handed on unchanged to unicode_to_xml() or
        byte_string_to_xml(); see those functions for the accepted values.
        Defaults to 'replace'.
    :returns: byte string encoded in output_encoding (utf8 by default)
    '''
    if isinstance(string, unicode):
        # Already unicode: nothing to guess, just escape and encode
        xml_bytes = unicode_to_xml(string,
                                   encoding=output_encoding,
                                   attrib=attrib,
                                   control_chars=control_chars)
    else:
        # Byte string: work out the likely input encoding first, then
        # transcode and escape using errors='replace'
        source_encoding = guess_encoding(string)
        xml_bytes = byte_string_to_xml(string,
                                       input_encoding=source_encoding,
                                       errors='replace',
                                       output_encoding=output_encoding,
                                       attrib=attrib,
                                       control_chars=control_chars)
    return xml_bytes
def test_guess_encoding_with_chardet_uninstalled(self):
    if chardet:
        # The mangled result below is only expected without chardet
        raise SkipTest('chardet installed, euc_jp will not be mangled')

    euc_jp_bytes = self.euc_jp_japanese
    # Expect the euc_jp bytes to decode to the mangled latin1 reading
    tools.ok_(
        to_unicode(euc_jp_bytes, misc.guess_encoding(euc_jp_bytes))
        == self.u_mangled_euc_jp_as_latin1)
def test_guess_encoding_with_chardet_installed(self):
    # Skip unless chardet is available; the check below depends on it
    if not chardet:
        raise SkipTest(
            'chardet not installed, euc_jp will not be guessed correctly')

    euc_jp_bytes = self.euc_jp_japanese
    # With chardet present the euc_jp bytes should round-trip to the
    # original Japanese unicode string
    tools.ok_(
        to_unicode(euc_jp_bytes, misc.guess_encoding(euc_jp_bytes))
        == self.u_japanese)
def test_guess_encoding_with_chardet_installed(self):
    if not chardet:
        # Nothing to verify here when chardet is missing
        raise SkipTest('chardet not installed, euc_jp will not be guessed correctly')

    # Decode the euc_jp sample with the guessed encoding and expect the
    # original Japanese text back
    guessed = misc.guess_encoding(self.euc_jp_japanese)
    decoded = to_unicode(self.euc_jp_japanese, guessed)
    tools.ok_(decoded == self.u_japanese)