def test_newline_decoder(self): import codecs decoder = codecs.getincrementaldecoder("utf-8")() decoder = io.IncrementalNewlineDecoder(decoder, translate=True) self.assertEquals(decoder.decode(b'\xe8\xa2\x88'), u"\u8888") self.assertEquals(decoder.decode(b'\xe8'), u"") self.assertEquals(decoder.decode(b'\xa2'), u"") self.assertEquals(decoder.decode(b'\x88'), u"\u8888") self.assertEquals(decoder.decode(b'\xe8'), u"") self.assertRaises(UnicodeDecodeError, decoder.decode, b'', final=True) decoder.setstate((b'', 0)) self.assertEquals(decoder.decode(b'\n'), u"\n") self.assertEquals(decoder.decode(b'\r'), u"") self.assertEquals(decoder.decode(b'', final=True), u"\n") self.assertEquals(decoder.decode(b'\r', final=True), u"\n") self.assertEquals(decoder.decode(b'\r'), u"") self.assertEquals(decoder.decode(b'a'), u"\na") self.assertEquals(decoder.decode(b'\r\r\n'), u"\n\n") self.assertEquals(decoder.decode(b'\r'), u"") self.assertEquals(decoder.decode(b'\r'), u"\n") self.assertEquals(decoder.decode(b'\na'), u"\na") self.assertEquals(decoder.decode(b'\xe8\xa2\x88\r\n'), u"\u8888\n") self.assertEquals(decoder.decode(b'\xe8\xa2\x88'), u"\u8888") self.assertEquals(decoder.decode(b'\n'), u"\n") self.assertEquals(decoder.decode(b'\xe8\xa2\x88\r'), u"\u8888") self.assertEquals(decoder.decode(b'\n'), u"\n") decoder = codecs.getincrementaldecoder("utf-8")() decoder = io.IncrementalNewlineDecoder(decoder, translate=True) self.assertEquals(decoder.newlines, None) decoder.decode(b"abc\n\r") self.assertEquals(decoder.newlines, u'\n') decoder.decode(b"\nabc") self.assertEquals(decoder.newlines, ('\n', '\r\n')) decoder.decode(b"abc\r") self.assertEquals(decoder.newlines, ('\n', '\r\n')) decoder.decode(b"abc") self.assertEquals(decoder.newlines, ('\r', '\n', '\r\n')) decoder.decode(b"abc\r") decoder.reset() self.assertEquals(decoder.decode(b"abc"), "abc") self.assertEquals(decoder.newlines, None)
def source_to_unicode(txt, errors='replace', skip_encoding_cookie=True):
    """Convert Python source code to a unicode string.

    Unicode strings are passed through unchanged. Byte strings are checked
    for the Python source file encoding cookie to determine the encoding;
    if detection raises SyntaxError, ASCII is assumed.

    txt can be either a bytes buffer or a string containing the source code.
    Anything that is neither text nor a byte string is used as the buffer
    directly, so it must provide ``readline`` and ``seek``.

    errors is forwarded to io.TextIOWrapper as its error handler.

    If skip_encoding_cookie is true, the decoded lines are passed through
    strip_encoding_cookie() before being joined; otherwise the whole decoded
    text is returned as read.
    """
    if isinstance(txt, six.text_type):
        return txt

    stream = io.BytesIO(txt) if isinstance(txt, six.binary_type) else txt

    try:
        encoding, _ = detect_encoding(stream.readline)
    except SyntaxError:
        encoding = "ascii"
    # detect_encoding consumed the first line(s); rewind before decoding.
    stream.seek(0)

    text = io.TextIOWrapper(stream, encoding, errors=errors,
                            line_buffering=True)
    # NOTE(review): presumably set so consumers inspecting ``.mode`` see a
    # read-mode file -- confirm against callers.
    text.mode = 'r'
    if skip_encoding_cookie:
        return u"".join(strip_encoding_cookie(text))
    return text.read()
def get_source(self, fullname):
    """Concrete implementation of InspectLoader.get_source.

    Reads the module's bytes through ``get_filename``/``get_data``.  When
    ``py3k`` is true the bytes are decoded with the encoding reported by
    ``tokenize.detect_encoding`` and newlines are translated to "\\n";
    otherwise the raw bytes are returned unchanged.

    Raises ImportError if the data cannot be read, the encoding cannot be
    detected, or the bytes cannot be decoded.
    """
    path = self.get_filename(fullname)
    try:
        source_bytes = self.get_data(path)
    except IOError:
        # No ``raise ... from``: the py3k switch suggests this file must
        # still parse on Python 2.
        raise ImportError("source not available through get_data()")

    if not py3k:
        return source_bytes  # XXX proper encoding

    import io
    import tokenize

    try:
        encoding, _ = tokenize.detect_encoding(
            io.BytesIO(source_bytes).readline)
    except SyntaxError:
        raise ImportError("Failed to detect encoding")
    try:
        text = source_bytes.decode(encoding)
    except UnicodeDecodeError:
        raise ImportError("Failed to decode source file")

    # final=True: without it the decoder holds back a trailing "\r" (it
    # might be the start of "\r\n") and the last line terminator is lost.
    return io.IncrementalNewlineDecoder(None, True).decode(text, final=True)
def test_universal_newlines(self):
    """get_source() must return the source with newlines translated."""
    raw_text = 'x = 42\r\ny = -13\r\n'
    loader = self.SourceOnlyLoaderMock('mod.file')
    loader.source = raw_text.encode('utf-8')

    # Reference result: the same text run through the newline translator.
    translator = io.IncrementalNewlineDecoder(None, True)
    translated = translator.decode(raw_text)

    self.assertEqual(loader.get_source('mod'), translated)
def test_universal_newlines(self):
    """Check that get_source() applies universal newlines (per PEP 302)."""
    module_name = 'mod'
    crlf_source = "x = 42\r\ny = -13\r\n"

    loader = SourceOnlyLoaderMock('mod.file')
    loader.source = crlf_source.encode('utf-8')

    # The expected text is the original run through the newline translator.
    expected = io.IncrementalNewlineDecoder(None, True).decode(crlf_source)
    actual = loader.get_source(module_name)
    self.assertEqual(actual, expected)
def decode_source(source_bytes):
    """Decode bytes representing source code and return the string.

    The bytes-to-text conversion is delegated to source_to_unicode (called
    with its defaults); the result is then passed through a universal
    newline translator so "\\r\\n" and "\\r" become "\\n".
    """
    text = source_to_unicode(source_bytes)
    # final=True: without it the decoder holds back a trailing "\r" (it
    # might be the start of "\r\n") and the last line terminator is lost.
    return io.IncrementalNewlineDecoder(None, True).decode(text, final=True)
def test_newline_decoder(self):
    """Run the newline-decoder checks over a range of codecs.

    Each encoding gets a fresh translating IncrementalNewlineDecoder that is
    handed to ``check_newline_decoder``; a final UTF-8 decoder is handed to
    ``check_newline_decoder_utf8``.
    """
    def make_translating_decoder(encoding):
        # Wrap the codec's incremental decoder with newline translation.
        inner = codecs.getincrementaldecoder(encoding)()
        return io.IncrementalNewlineDecoder(inner, translate=True)

    for enc in (
        'utf-8', 'latin-1',
        'utf-16', 'utf-16-le', 'utf-16-be',
        'utf-32', 'utf-32-le', 'utf-32-be',
    ):
        self.check_newline_decoder(make_translating_decoder(enc), enc)

    self.check_newline_decoder_utf8(make_translating_decoder("utf-8"))
def decode_source(source_bytes: bytes) -> str:
    """Decode bytes of Python source into text with universal newlines.

    Copied from importlib._bootstrap_external, with one deviation: the
    newline decoder is told the input is final, so a source that ends in a
    lone "\\r" keeps its last line terminator (as "\\n") instead of having it
    silently dropped.

    The encoding is taken from ``tokenize.detect_encoding``, which may raise
    SyntaxError; ``bytes.decode`` may raise UnicodeDecodeError.
    """
    encoding, _ = tokenize.detect_encoding(io.BytesIO(source_bytes).readline)
    text = source_bytes.decode(encoding)
    # final=True: otherwise a trailing "\r" is held back as a possible start
    # of "\r\n" and never emitted.
    newline_decoder = io.IncrementalNewlineDecoder(None, translate=True)
    return newline_decoder.decode(text, final=True)