def test_newline_bytes(self):
     import _io
     # Issue 5433: Excessive optimization in IncrementalNewlineDecoder
     def _check(dec):
         assert dec.newlines is None
         assert dec.decode("\u0D00") == "\u0D00"
         assert dec.newlines is None
         assert dec.decode("\u0A00") == "\u0A00"
         assert dec.newlines is None
     dec = _io.IncrementalNewlineDecoder(None, translate=False)
     _check(dec)
     dec = _io.IncrementalNewlineDecoder(None, translate=True)
     _check(dec)
示例#2
0
def test_newlines2():
    """Feed mixed line endings one byte at a time through a UTF-8 decoder.

    The input contains ``\\r\\n``, ``\\n`` and a lone ``\\r``.  With
    ``translate=True`` every one of them must come out as ``\\n``, and the
    decoder must report all three kinds in ``newlines``.
    """
    inner_decoder = codecs.getincrementaldecoder("utf-8")()
    decoder = _io.IncrementalNewlineDecoder(inner_decoder, translate=True)
    msg = b"abc\r\n\n\r\r\n\n"
    # Slice instead of iterating: iterating bytes yields ints, slicing keeps
    # one-byte bytes objects that can be handed straight to decode().
    pieces = [decoder.decode(msg[i:i + 1]) for i in range(len(msg))]
    # Five line endings in the input (\r\n, \n, \r, \r\n, \n), each
    # translated to a single "\n".
    assert "".join(pieces) == "abc\n\n\n\n\n"
    assert set(decoder.newlines) == {"\r", "\n", "\r\n"}
示例#3
0
def decode_source(source_bytes):
    """Return *source_bytes* decoded to text with newlines translated.

    The encoding is the one reported by ``tokenize.detect_encoding`` for the
    given bytes.  The decoded text is then passed once through a translating
    ``IncrementalNewlineDecoder``.
    """
    # Copied from _bootstrap_external.py.
    import _io
    import tokenize  # To avoid bootstrap issues.

    readline = _io.BytesIO(source_bytes).readline
    encoding_name, _lines_read = tokenize.detect_encoding(readline)
    text = source_bytes.decode(encoding_name)
    return _io.IncrementalNewlineDecoder(None, True).decode(text)
示例#4
0
    def test_newline_decoder(self):
        """Exercise ``_io.IncrementalNewlineDecoder`` with ``translate=True``.

        Two groups of checks run:

        * for every entry in ``encodings`` (``None`` meaning the decoder is
          fed text directly), input is pushed through one unit at a time
          while ``newlines`` and the translated output are verified;
        * a UTF-8 specific pass checks multi-byte sequences split across
          calls, ``final=True`` handling and ``getstate()``/``setstate()``.
        """
        import _io
        import codecs

        def check_newline_decoding_utf8(decoder):
            # UTF-8 specific tests for a newline decoder
            def _check_decode(b, s, **kwargs):
                # We exercise getstate() / setstate() as well as decode():
                # decoding the same input again from the saved state must
                # give the same result.
                state = decoder.getstate()
                assert decoder.decode(b, **kwargs) == s
                decoder.setstate(state)
                assert decoder.decode(b, **kwargs) == s

            _check_decode(b'\xe8\xa2\x88', u"\u8888")

            # The same character delivered one byte per call.
            _check_decode(b'\xe8', "")
            _check_decode(b'\xa2', "")
            _check_decode(b'\x88', u"\u8888")

            _check_decode(b'\xe8', "")
            _check_decode(b'\xa2', "")
            _check_decode(b'\x88', u"\u8888")

            # A truncated sequence must be reported once input is final.
            _check_decode(b'\xe8', "")
            raises(UnicodeDecodeError, decoder.decode, b'', final=True)

            decoder.reset()
            _check_decode(b'\n', "\n")
            # A trailing "\r" is held back until the next call.
            _check_decode(b'\r', "")
            _check_decode(b'', "\n", final=True)
            _check_decode(b'\r', "\n", final=True)

            _check_decode(b'\r', "")
            _check_decode(b'a', "\na")

            _check_decode(b'\r\r\n', "\n\n")
            _check_decode(b'\r', "")
            _check_decode(b'\r', "\n")
            _check_decode(b'\na', "\na")

            _check_decode(b'\xe8\xa2\x88\r\n', u"\u8888\n")
            _check_decode(b'\xe8\xa2\x88', u"\u8888")
            _check_decode(b'\n', "\n")
            _check_decode(b'\xe8\xa2\x88\r', u"\u8888")
            _check_decode(b'\n', "\n")

        def check_newline_decoding(decoder, encoding):
            result = []
            if encoding is not None:
                encoder = codecs.getincrementalencoder(encoding)()

                def _decode_bytewise(s):
                    # Decode one byte at a time.  Slice rather than iterate:
                    # iterating a bytes object yields ints on Python 3, and
                    # the wrapped decoder only accepts bytes-like input.
                    data = encoder.encode(s)
                    for i in range(len(data)):
                        result.append(decoder.decode(data[i:i + 1]))
            else:
                encoder = None

                def _decode_bytewise(s):
                    # Decode one char at a time
                    for c in s:
                        result.append(decoder.decode(c))

            assert decoder.newlines is None
            _decode_bytewise(u"abc\n\r")
            assert decoder.newlines == '\n'
            _decode_bytewise(u"\nabc")
            assert decoder.newlines == ('\n', '\r\n')
            _decode_bytewise(u"abc\r")
            assert decoder.newlines == ('\n', '\r\n')
            _decode_bytewise(u"abc")
            assert decoder.newlines == ('\r', '\n', '\r\n')
            _decode_bytewise(u"abc\r")
            assert "".join(result) == "abc\n\nabcabc\nabcabc"
            # After a reset the record of seen newlines must be cleared.
            decoder.reset()
            payload = u"abc"
            if encoder is not None:
                encoder.reset()
                payload = encoder.encode(payload)
            assert decoder.decode(payload) == "abc"
            assert decoder.newlines is None

        encodings = (
            # None meaning the IncrementalNewlineDecoder takes unicode input
            # rather than bytes input
            None,
            'utf-8',
            'latin-1',
            'utf-16',
            'utf-16-le',
            'utf-16-be',
            'utf-32',
            'utf-32-le',
            'utf-32-be',
        )
        for enc in encodings:
            if enc is None:
                inner = None
            else:
                inner = codecs.getincrementaldecoder(enc)()
            decoder = _io.IncrementalNewlineDecoder(inner, translate=True)
            check_newline_decoding(decoder, enc)
        decoder = codecs.getincrementaldecoder("utf-8")()
        decoder = _io.IncrementalNewlineDecoder(decoder, translate=True)
        check_newline_decoding_utf8(decoder)
示例#5
0
 def test_cr_not_ignored2(self):
     d = _io.IncrementalNewlineDecoder(None, translate=False)
     d.decode("h\n\r")
     d.decode("\n")
     self.assertEqual(('\n', '\r\n'), d.newlines)
示例#6
0
 def test_cr_not_ignored(self):
     d = _io.IncrementalNewlineDecoder(None, translate=False)
     d.decode("h\rello")
     self.assertEqual('\r', d.newlines)