Пример #1
0
 def test_html_entities_unescape(self):
     unescape = misc.html_entities_unescape

     # A byte string must be rejected with TypeError
     tools.assert_raises(TypeError, unescape, b'byte string')

     # Each pair is (input, expected output); checked in order
     expectations = (
         # Escaped entities expand to the matching characters
         (self.u_entity_escape, self.u_entity),
         # Wrapping the entities in a tag is expected to give the same text
         ('<tag>%s</tag>' % self.u_entity_escape, self.u_entity),
         # A numeric reference beyond the Unicode range comes back unchanged
         ('a&#1234567890;b', 'a&#1234567890;b'),
         # Hexadecimal and decimal references to U+FFFD both expand to it
         ('a&#xfffd;b', 'a\ufffdb'),
         ('a&#65533;b', 'a\ufffdb'),
     )
     for escaped, expected in expectations:
         tools.ok_(unescape(escaped) == expected)
Пример #2
0
 def test_html_entities_unescape(self):
     # A byte string must be rejected with TypeError.  The explicit b''
     # prefix keeps this literal a byte string on every interpreter; a bare
     # '' literal is text under ``unicode_literals`` and on Python 3.
     tools.assert_raises(TypeError, misc.html_entities_unescape,
                         b'byte string')
     # Escaped entities expand to the matching characters
     tools.ok_(
         misc.html_entities_unescape(self.u_entity_escape) == self.u_entity)
     # Wrapping the entities in a tag is expected to give the same text
     tools.ok_(
         misc.html_entities_unescape(u'<tag>%s</tag>' %
                                     self.u_entity_escape) == self.u_entity)
     # A numeric reference beyond the Unicode range comes back unchanged
     tools.ok_(
         misc.html_entities_unescape(u'a&#1234567890;b') ==
         u'a&#1234567890;b')
     # Hexadecimal and decimal references to U+FFFD both expand to it
     tools.ok_(misc.html_entities_unescape(u'a&#xfffd;b') == u'a\ufffdb')
     tools.ok_(misc.html_entities_unescape(u'a&#65533;b') == u'a\ufffdb')
Пример #3
0
def xml_to_unicode(byte_string, encoding='utf8', errors='replace'):
    '''Decode a byte string taken from an xml file into unicode

    :arg byte_string: byte string to decode
    :kwarg encoding: encoding that :attr:`byte_string` is in.  Defaults to
        ``utf8``.
    :kwarg errors: What to do when some bytes cannot be decoded using
        :attr:`encoding`.  Passed through to :func:`to_unicode`; see its
        docstring for legal values.  Defaults to ``replace``.
    :returns: unicode string decoded from :attr:`byte_string` with its html
        entities expanded

    This function attempts to undo what :func:`unicode_to_xml` does.  The
    byte string (presumably read in from an xml file) is first decoded to
    unicode and then every html entity in it is expanded into the unicode
    character it stands for.  Control characters that were removed before
    the data was written to the file cannot be restored.  If you need to
    keep such characters, use :func:`xml_to_bytes` and :func:`bytes_to_xml`
    instead.
    '''
    # Decode first: the entity expansion step is handed the decoded text
    decoded = to_unicode(byte_string, encoding=encoding, errors=errors)
    return html_entities_unescape(decoded)
Пример #4
0
def xml_to_unicode(byte_string, encoding='utf-8', errors='replace'):
    '''Decode :class:`bytes` taken from an xml file into a :class:`str`

    :arg byte_string: the :class:`bytes` to decode
    :kwarg encoding: encoding that :attr:`byte_string` is in.  Defaults to
        ``utf-8``.
    :kwarg errors: What to do when some bytes are not valid in
        :attr:`encoding`.  Passed through to :func:`to_unicode`; see its
        documentation for legal values.  Defaults to ``replace``.
    :rtype: :class:`str` string
    :returns: string decoded from :attr:`byte_string` with its html entities
        expanded

    This function attempts to undo what :func:`unicode_to_xml` does.  The
    :class:`bytes` (presumably read in from an xml file) are first decoded
    into a :class:`str` and then every html entity in it is expanded into
    the unicode character it stands for.  Any :term:`control characters`
    that were removed before the data was written to the file cannot be
    restored.  If you need to keep such characters, use
    :func:`xml_to_bytes` and :func:`bytes_to_xml`, or use one of the
    strategies documented in :func:`unicode_to_xml` instead.
    '''
    # Decode first: the entity expansion step is handed the decoded text
    decoded = to_unicode(byte_string, encoding=encoding, errors=errors)
    return html_entities_unescape(decoded)