def test_toUnicode_UTF16(self):
    """Check toUnicode on UTF-16 input, with and without a byte order mark."""
    declared = TEST_W_ENC % 'UTF-16'

    #======================================================================
    # with BOM: little endian first, then big endian
    #======================================================================
    bom_cases = (
        (codecs.BOM_UTF16_LE, 'UTF-16le', 'utf-16le'),
        (codecs.BOM_UTF16_BE, 'UTF-16be', 'utf-16be'),
    )
    for bom, codec, expected_enc in bom_cases:
        raw = bom + unicode(declared).encode(codec)
        # declaration is cut off
        text, detected = toUnicode(raw, True)
        assert detected == expected_enc
        assert text == TEST_BASE
        # declaration is kept
        text, detected = toUnicode(raw, False)
        assert detected == expected_enc
        assert text == declared

    #======================================================================
    # without BOM
    #======================================================================
    # Not easily handled: with no BOM and an unreadable declaration the
    # reported encoding is expected to be utf-8.
    raw = unicode(declared).encode('UTF-16le')
    _, detected = toUnicode(raw, True)
    assert detected == 'utf-8'
def test_toUnicode_utf8(self):
    """Check conversion and declaration parsing of UTF-8 encoded strings."""

    def check(convert, raw, uncut):
        # Run `convert` on `raw` with and without cutting the declaration.
        # with cutting declaration
        text, detected = convert(raw, True)
        assert text == TEST_BASE
        assert detected == 'utf-8'
        # without cutting declaration
        text, detected = convert(raw, False)
        assert text == uncut
        assert detected == 'utf-8'

    declared = TEST_W_ENC % 'UTF-8'
    raw_with_enc = unicode(declared).encode('UTF-8')
    raw_without_enc = unicode(TEST_WO_ENC).encode('UTF-8')

    #======================================================================
    # with BOM
    #======================================================================
    # with encoding in the declaration
    check(toUnicode, codecs.BOM_UTF8 + raw_with_enc, declared)
    # without encoding in the declaration
    check(toUnicode, codecs.BOM_UTF8 + raw_without_enc, TEST_WO_ENC)

    #======================================================================
    # without BOM
    #======================================================================
    # NOTE(review): these cases call parseXMLDeclaration directly instead
    # of toUnicode -- confirm that this is intended.
    # with encoding in the declaration
    check(parseXMLDeclaration, raw_with_enc, declared)
    # without encoding in the declaration
    check(parseXMLDeclaration, raw_without_enc, TEST_WO_ENC)
def newXMLDocument(data, id=None, uid=None):
    """
    Build a XmlDocument from XML data.

    The data is converted to unicode and a possible XML declaration is
    stripped before the document object is created; id and uid are handed
    to XmlDocument unchanged. Use this function whenever a XmlDocument has
    to be created manually!

    Raises InvalidParameterError if data has length zero.
    """
    # refuse empty input
    if not len(data):
        raise InvalidParameterError("XML document is empty.")
    # unicode input only needs its declaration parsed and removed, anything
    # else has to be decoded to unicode first
    convert = parseXMLDeclaration if isinstance(data, unicode) else toUnicode
    text, _ = convert(data, remove_decl=True)
    return XmlDocument(text, id, uid)