Exemplo n.º 1
0
    def test_detect_encoding(self):
        """Check detect_encoding on fixed samples and on every single byte.

        The fixed samples pin the expected return value for two byte strings
        passed together with an encoding name, for undecodable-as-UTF-8 input
        with and without a fallback, and for two HTML snippets that declare
        unknown charsets.  The final loop only checks that no exception is
        raised.
        """
        mojibake = b'\x95\xb6\x8e\x9a\x89\xbb\x82\xaf'
        krakozyabry = b'\xeb\xd2\xc1\xcb\xcf\xda\xd1\xc2\xd2\xd9'

        # The encoding name passed alongside the data is expected back.
        self.assertEqual('shift_jis', detect_encoding(mojibake, 'shift_jis'))
        self.assertEqual('koi8-r', detect_encoding(krakozyabry, 'koi8-r'))

        # 0xff never occurs in UTF-8; with default arguments the expected
        # answer is 'iso8859-1'.
        self.assertEqual('iso8859-1', detect_encoding(b'\xff\xff\xff\x81'))

        # Same bytes, but with 'utf8' requested and an empty fallback the
        # call is expected to raise instead of returning a guess.
        self.assertRaises(ValueError,
                          detect_encoding,
                          b'\xff\xff\xff\x81',
                          'utf8',
                          fallback=())

        # HTML whose <meta charset> names something that is not a codec.
        self.assertEqual(
            'ascii',
            detect_encoding(b'<html><meta charset="dog_breath"><body>',
                            is_html=True))

        # HTML whose <meta content> attribute has an unterminated quote.
        self.assertEqual(
            'ascii',
            detect_encoding(
                b'<html><meta content="text/html; charset=cat-meows><body>',
                is_html=True))

        # Smoke test: feed every possible byte value and make sure nothing
        # raises.  bytes(i) must not be used here: on Python 3 it builds i
        # zero bytes (on Python 2, the decimal digits of i), so most byte
        # values would never be exercised.  Going through bytearray yields
        # exactly one byte with value i on both.
        single_bytes = [bytes(bytearray((i,))) for i in range(256)]
        for length in range(1, 2):
            for data in itertools.permutations(single_bytes, length):
                detect_encoding(b''.join(data))
Exemplo n.º 2
0
    def test_detect_encoding(self):
        """Check detect_encoding on fixed samples and on every single byte.

        Covers: two byte strings passed with an explicit encoding name,
        input containing 0xff with default arguments and with an empty
        fallback, two HTML snippets with unusable charset declarations, and
        a crash-only sweep over all 256 one-byte inputs.
        """
        mojibake = b'\x95\xb6\x8e\x9a\x89\xbb\x82\xaf'
        krakozyabry = b'\xeb\xd2\xc1\xcb\xcf\xda\xd1\xc2\xd2\xd9'

        # The encoding name handed in with the data is expected back.
        self.assertEqual(
            'shift_jis',
            detect_encoding(mojibake, 'shift_jis')
        )
        self.assertEqual(
            'koi8-r',
            detect_encoding(krakozyabry, 'koi8-r')
        )

        # With default arguments these bytes are expected to be reported
        # as 'iso8859-1'.
        self.assertEqual(
            'iso8859-1',
            detect_encoding(b'\xff\xff\xff\x81')
        )

        # With 'utf8' requested and fallback=() the call must raise.
        self.assertRaises(
            ValueError,
            detect_encoding, b'\xff\xff\xff\x81',
            'utf8', fallback=()
        )

        # <meta charset> naming something that is not a codec.
        self.assertEqual(
            'ascii',
            detect_encoding(
                b'<html><meta charset="dog_breath"><body>',
                is_html=True
            )
        )

        # <meta content> attribute whose quote is never closed.
        self.assertEqual(
            'ascii',
            detect_encoding(
                b'<html><meta content="text/html; charset=cat-meows><body>',
                is_html=True
            )
        )

        # Crash-only sweep over every byte value.  bytes(i) would create a
        # run of i zero bytes on Python 3 (or str(i) on Python 2) rather
        # than the single byte i, so build each byte via bytearray instead.
        every_byte = [bytes(bytearray((value,))) for value in range(256)]
        for length in range(1, 2):
            iterable = itertools.permutations(every_byte, length)
            for data in iterable:
                detect_encoding(b''.join(data))
Exemplo n.º 3
0
    def test_detect_encoding(self):
        """Check detect_encoding on fixed samples and on every single byte.

        Covers: two byte strings passed with an explicit encoding name (both
        intact and repeated with the last byte dropped), input containing
        0xff with default arguments and with an empty fallback, two HTML
        snippets with unusable charset declarations, BOM-prefixed UTF-16
        data with the last byte dropped, and two crash-only checks.
        """
        mojibake = b'\x95\xb6\x8e\x9a\x89\xbb\x82\xaf'
        krakozyabry = b'\xeb\xd2\xc1\xcb\xcf\xda\xd1\xc2\xd2\xd9'

        # The encoding name handed in with the data is expected back.
        self.assertEqual(
            'shift_jis',
            detect_encoding(mojibake, 'shift_jis')
        )
        self.assertEqual(
            'koi8-r',
            detect_encoding(krakozyabry, 'koi8-r')
        )

        # Same expectation when the sample is repeated ten times and its
        # final byte is cut off.
        self.assertEqual(
            'shift_jis',
            detect_encoding((mojibake * 10)[:-1], 'shift_jis')
        )
        self.assertEqual(
            'koi8-r',
            detect_encoding((krakozyabry * 10)[:-1], 'koi8-r')
        )

        # With default arguments these bytes are expected to be reported
        # as 'iso8859-1'.
        self.assertEqual(
            'iso8859-1',
            detect_encoding(b'\xff\xff\xff\x81')
        )

        # With 'utf8' requested and fallback=() the call must raise.
        self.assertRaises(
            ValueError,
            detect_encoding, b'\xff\xff\xff\x81',
            'utf8', fallback=()
        )

        # <meta charset> naming something that is not a codec.
        self.assertEqual(
            'utf-8',
            detect_encoding(
                b'<html><meta charset="dog_breath"><body>',
                is_html=True
            )
        )

        # <meta content> attribute whose quote is never closed.
        self.assertEqual(
            'utf-8',
            detect_encoding(
                b'<html><meta content="text/html; charset=cat-meows><body>',
                is_html=True
            )
        )

        # UTF-16-LE BOM followed by encoded text missing its last byte.
        self.assertEqual(
            'utf-16-le',
            detect_encoding(
                codecs.BOM_UTF16_LE +
                'Let’s hope no one uses UTF-36'.encode('utf_16_le')[:-1]
            )
        )

        # Check for no crash: XML declaration whose encoding name contains
        # a non-ASCII byte.
        detect_encoding(
            b'<?xml version="1.0" encoding="UTF-\xdb" ?>'
        )

        # Check for no crash on any single byte.  bytes(i) would create a
        # run of i zero bytes on Python 3 (or str(i) on Python 2) rather
        # than the single byte i, so build each byte via bytearray instead.
        every_byte = [bytes(bytearray((value,))) for value in range(256)]
        for length in range(1, 2):
            iterable = itertools.permutations(every_byte, length)
            for data in iterable:
                detect_encoding(b''.join(data))
Exemplo n.º 4
0
    def test_detect_encoding(self):
        """Check detect_encoding on fixed samples and on every single byte.

        The fixed samples pin the expected return value for two byte strings
        passed together with an encoding name (intact and with the tail byte
        removed), for input containing 0xff with and without a fallback, for
        two HTML snippets that declare unknown charsets, and for truncated
        BOM-prefixed UTF-16 data.  The remaining calls only check that no
        exception is raised.
        """
        mojibake = b'\x95\xb6\x8e\x9a\x89\xbb\x82\xaf'
        krakozyabry = b'\xeb\xd2\xc1\xcb\xcf\xda\xd1\xc2\xd2\xd9'

        # The encoding name passed alongside the data is expected back.
        self.assertEqual('shift_jis', detect_encoding(mojibake, 'shift_jis'))
        self.assertEqual('koi8-r', detect_encoding(krakozyabry, 'koi8-r'))

        # Same expectation for the sample repeated ten times with its final
        # byte cut off.
        self.assertEqual('shift_jis',
                         detect_encoding((mojibake * 10)[:-1], 'shift_jis'))
        self.assertEqual('koi8-r',
                         detect_encoding((krakozyabry * 10)[:-1], 'koi8-r'))

        # 0xff never occurs in UTF-8; with default arguments the expected
        # answer is 'iso8859-1'.
        self.assertEqual('iso8859-1', detect_encoding(b'\xff\xff\xff\x81'))

        # Same bytes, but with 'utf8' requested and an empty fallback the
        # call is expected to raise instead of returning a guess.
        self.assertRaises(ValueError,
                          detect_encoding,
                          b'\xff\xff\xff\x81',
                          'utf8',
                          fallback=())

        # HTML whose <meta charset> names something that is not a codec.
        self.assertEqual(
            'utf-8',
            detect_encoding(b'<html><meta charset="dog_breath"><body>',
                            is_html=True))

        # HTML whose <meta content> attribute has an unterminated quote.
        self.assertEqual(
            'utf-8',
            detect_encoding(
                b'<html><meta content="text/html; charset=cat-meows><body>',
                is_html=True))

        # UTF-16-LE BOM followed by encoded text missing its last byte.
        self.assertEqual(
            'utf-16-le',
            detect_encoding(
                codecs.BOM_UTF16_LE +
                'Let’s hope no one uses UTF-36'.encode('utf_16_le')[:-1]))

        # Check for no crash: XML declaration whose encoding name contains
        # a non-ASCII byte.
        detect_encoding(b'<?xml version="1.0" encoding="UTF-\xdb" ?>')

        # Check for no crash on any single byte.  bytes(i) must not be used
        # here: on Python 3 it builds i zero bytes (on Python 2, the decimal
        # digits of i), so most byte values would never be exercised.  Going
        # through bytearray yields exactly one byte with value i on both.
        single_bytes = [bytes(bytearray((i,))) for i in range(256)]
        for length in range(1, 2):
            for data in itertools.permutations(single_bytes, length):
                detect_encoding(b''.join(data))