def get_text(string, start, end, bom=True):
    """Slice ``string`` using offsets counted in UTF-16 code units.

    Character offsets produced by Rosette (and other software with a native
    UTF-16 string representation) count 16-bit code units, whereas Python
    3.3+ strings are indexed by code point (see
    https://www.python.org/dev/peps/pep-0393/).  The string is therefore
    encoded to UTF-16, sliced at the byte level, and decoded again.

    Args:
        string: The text to slice; must be a ``str``.
        start: First code unit of the slice, or ``None`` for "from the
            beginning".  Non-negative values are expected.
        end: Code unit one past the end of the slice, or ``None`` for "to
            the end".  Non-negative values are expected.
        bom: When true (the default), offsets are relative to the text
            itself.  When false, offsets are relative to a buffer whose code
            unit 0 is the byte order mark, i.e. they are one larger than the
            corresponding text offsets.

    Returns:
        The selected text.  The byte order mark that prefixes the internal
        buffer is never part of the result.  A slice that ends between the
        two halves of a surrogate pair drops the incomplete trailing half.

    Raises:
        ValueError: If ``string`` is not a ``str`` or an offset is neither an
            ``int`` nor ``None``.
        UnicodeDecodeError: If the slice begins between the two halves of a
            surrogate pair.
    """
    import codecs

    if not isinstance(string, str):
        raise ValueError('expected string to be of type str')
    if not (start is None or isinstance(start, int)):
        raise ValueError('expected start to be of type int or NoneType')
    if not (end is None or isinstance(end, int)):
        raise ValueError('expected end to be of type int or NoneType')

    # Every UTF-16 code unit occupies 2 bytes; with ``bom`` the 2-byte mark at
    # the front of the buffer is skipped as well.
    shift = 2 if bom else 0
    if start is not None:
        start = start * 2 + shift
    if end is not None:
        end = end * 2 + shift

    # Use a fixed byte order so the decoder below never has to sniff a BOM.
    # The buffer keeps the same layout as a BOM-prefixed encoding (2-byte
    # mark followed by 2 bytes per code unit), so the offsets stay valid.
    payload, _ = codecs.utf_16_le_encode(string)
    buffer = codecs.BOM_UTF16_LE + payload

    first, last, _ = slice(start, end).indices(len(buffer))
    chunk = buffer[first:last]
    if first == 0:
        # Only a slice that starts at the very beginning of the buffer
        # contains the real byte order mark; drop it explicitly.
        chunk = chunk[2:]

    # An explicit-endian decoder treats a leading U+FEFF / U+FFFE inside the
    # text as ordinary characters instead of mistaking them for a BOM.
    sliced, _ = codecs.utf_16_le_decode(chunk)
    return sliced
def encode(self, input, errors='strict'):
    """Encode ``input`` as BOM-prefixed UTF-16 and re-route later calls.

    The first call marks the BOM as written and encodes with
    ``codecs.utf_16_encode``.  It then replaces ``self.encode`` on the
    instance with the codec function for the platform's byte order, so every
    subsequent call bypasses this method.

    Args:
        input: Text to encode.
        errors: Error handling scheme passed to the codec.

    Returns:
        The tuple returned by ``codecs.utf_16_encode``.
    """
    self.bom_written = True
    with_bom = codecs.utf_16_encode(input, errors)
    # Swap the bound method only after the first chunk encoded successfully.
    native_encoder = (
        codecs.utf_16_le_encode
        if sys.byteorder == 'little'
        else codecs.utf_16_be_encode
    )
    self.encode = native_encoder
    return with_bom
def validate(s):
    """Check the external ``./utf16`` helper against Python's UTF-16 codec.

    The UTF-8 encoding of ``s`` is written to the helper's stdin and whatever
    the helper writes to stdout is compared with ``codecs.utf_16_encode(s)``
    minus its 2-byte byte order mark.  The helper's exit status is not
    inspected.

    Args:
        s: The text to round-trip through the helper.

    Raises:
        RuntimeError: If the helper's output differs from the expected bytes.
            A diagnostic line is printed first.
    """
    # Drop the leading BOM (2 bytes) from the reference encoding.
    correct_utf16 = codecs.utf_16_encode(s)[0][2:]
    utf8 = codecs.utf_8_encode(s)[0]
    p = subprocess.Popen("./utf16",
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE)
    maybe_utf16 = p.communicate(utf8)[0]
    if correct_utf16 != maybe_utf16:
        # Single-argument print(...) is valid on both Python 2 and Python 3.
        print(u"tried to do %r, got back %r, expected %r" % (
            utf8, maybe_utf16, correct_utf16))
        # A bare string is not a valid exception object; raise a real one.
        raise RuntimeError("failed")
def encode(self, input, final=False):
    """Incrementally encode ``input`` as UTF-16, emitting the BOM only once.

    While ``self.encoder`` is ``None`` the chunk is encoded with
    ``codecs.utf_16_encode`` and ``self.encoder`` is then set to the codec
    function for the platform's byte order.  Later chunks go through that
    stored function.

    Args:
        input: Text chunk to encode.
        final: Accepted for interface compatibility; not used here.

    Returns:
        The encoded bytes for this chunk (first element of the codec result).
    """
    if self.encoder is not None:
        return self.encoder(input, self.errors)[0]

    first_chunk = codecs.utf_16_encode(input, self.errors)[0]
    # Select the follow-up encoder only once the first chunk has succeeded.
    self.encoder = (
        codecs.utf_16_le_encode
        if sys.byteorder == 'little'
        else codecs.utf_16_be_encode
    )
    return first_chunk
def encode(self, input, errors='strict'):
    """Encode ``input`` as UTF-16, prefixing a BOM on the first call only.

    If ``self.encoder`` is already set it is used directly.  Otherwise the
    text is encoded with ``codecs.utf_16_encode`` and ``self.encoder`` is
    set to the codec function matching ``sys.byteorder`` for later calls.

    Args:
        input: Text to encode.
        errors: Error handling scheme passed to the codec.

    Returns:
        The tuple returned by the codec function that was used.
    """
    active = self.encoder
    if active is not None:
        return active(input, errors)

    encoded = codecs.utf_16_encode(input, errors)
    # Remember the native-order encoder for every call after this one.
    if sys.byteorder == 'little':
        follow_up = codecs.utf_16_le_encode
    else:
        follow_up = codecs.utf_16_be_encode
    self.encoder = follow_up
    return encoded
def encode(self, input, errors='strict'):
    """Encode ``input`` as UTF-16; only the first call goes through
    ``codecs.utf_16_encode``.

    On the first call (``self.encoder`` is ``None``) the text is encoded
    with ``codecs.utf_16_encode`` and the codec function for the platform's
    byte order is stored in ``self.encoder``.  Every later call delegates to
    that stored function.

    Args:
        input: Text to encode.
        errors: Error handling scheme passed to the codec.

    Returns:
        The tuple returned by the codec function that was used.
    """
    if self.encoder is None:
        first = codecs.utf_16_encode(input, errors)
        little_endian = sys.byteorder == 'little'
        self.encoder = (
            codecs.utf_16_le_encode if little_endian
            else codecs.utf_16_be_encode
        )
        return first
    return self.encoder(input, errors)
def validate(s):
    """Compare the output of the ``./utf16`` program with Python's encoder.

    ``s`` is encoded to UTF-8 and piped into ``./utf16``; the bytes read back
    from its stdout must equal ``codecs.utf_16_encode(s)`` with the leading
    2-byte byte order mark removed.  The program's exit status is ignored.

    Args:
        s: The text to check.

    Raises:
        RuntimeError: If the program's output does not match.  The input,
            actual output and expected output are printed beforehand.
    """
    expected = codecs.utf_16_encode(s)[0][2:]  # reference bytes without BOM
    utf8_bytes = codecs.utf_8_encode(s)[0]
    proc = subprocess.Popen(
        "./utf16", stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    actual = proc.communicate(utf8_bytes)[0]
    if expected == actual:
        return
    # print(...) with one argument behaves the same on Python 2 and 3.
    print(u"tried to do %r, got back %r, expected %r" % (
        utf8_bytes, actual, expected))
    # Raising a plain string is invalid; use a proper exception instance.
    raise RuntimeError("failed")
def test_codecs_builtins(self): s = "abc" encoded = codecs.utf_8_encode(s) self.assertEqual(s, codecs.utf_8_decode(encoded[0])[0]) encoded = codecs.utf_7_encode(s) self.assertEqual(s, codecs.utf_7_decode(encoded[0])[0]) encoded = codecs.utf_16_encode(s) self.assertEqual(s, codecs.utf_16_decode(encoded[0])[0]) encoded = codecs.utf_16_le_encode(s) self.assertEqual(s, codecs.utf_16_le_decode(encoded[0])[0]) encoded = codecs.utf_16_be_encode(s) self.assertEqual(s, codecs.utf_16_be_decode(encoded[0])[0]) encoded = codecs.utf_32_encode(s) self.assertEqual(s, codecs.utf_32_decode(encoded[0])[0]) encoded = codecs.utf_32_le_encode(s) self.assertEqual(s, codecs.utf_32_le_decode(encoded[0])[0]) encoded = codecs.utf_32_be_encode(s) self.assertEqual(s, codecs.utf_32_be_decode(encoded[0])[0]) encoded = codecs.utf_32_be_encode(s) self.assertEqual(s, codecs.utf_32_be_decode(encoded[0])[0]) encoded = codecs.raw_unicode_escape_encode(s) self.assertEqual(s, codecs.raw_unicode_escape_decode(encoded[0])[0]) encoded = codecs.unicode_escape_encode(s) self.assertEqual(s, codecs.unicode_escape_decode(encoded[0])[0]) encoded = codecs.latin_1_encode(s) self.assertEqual(s, codecs.latin_1_decode(encoded[0])[0]) encoded = codecs.ascii_encode(s) self.assertEqual(s, codecs.ascii_decode(encoded[0])[0])
def test_utf_16_encode(self): #Sanity self.assertEqual(codecs.utf_16_encode("abc"), ('\xff\xfea\x00b\x00c\x00', 3))
def test_utf_16_encode(self): # On little-endian systems, UTF-16 encodes in UTF-16-LE prefixed with BOM data, num_processed = codecs.utf_16_encode("abc") self.assertEqual(data, codecs.BOM_UTF16 + b'a\0b\0c\0') self.assertEqual(num_processed, 3)
def update_event(self, inp=-1):
    """Encode input 0 with ``codecs.utf_16_encode`` and publish the result.

    Inputs 0, 1 and 2 are read in that order and passed positionally to
    ``codecs.utf_16_encode`` (text, error handling scheme, byte order).  The
    value it returns is written to output 0.

    Args:
        inp: Not used by this method.
    """
    text = self.input(0)
    errors = self.input(1)
    byteorder = self.input(2)
    encoded = codecs.utf_16_encode(text, errors, byteorder)
    self.set_output_val(0, encoded)