Exemplo n.º 1
0
    def test_complete_gibberish(self):
        """Every fully garbled sample must make the chaos probe give up."""
        gibberish_samples = (
            """ØĢØŠØģاØĶŲ„ Ų„Ųˆ ØĢŲ† اŲ„Ų†Ø§Øģ ŲŠŲˆŲ… Ų…ا ØģŲˆŲŲŠØŠØģاØĶŲ„ŲˆŲ†ØŒ ØŊØđŲ†Ø§ Ų†ØģŲ…Øđ ØđŲ† (ŲØąŲˆØŊŲˆ) ŲˆØ§Ų„ØŪا؊Ų…""",
            """ÇáÚŞáíÉ , ÇáÊäæíã ÇáãÛäÇØíÓí æ / Ãæ ÇáÇŞÊÑÇÍ""",
        )

        # One probe per sample, checked in the order the samples are listed.
        for sample in gibberish_samples:
            self.assertTrue(ProbeChaos(sample).gave_up)
Exemplo n.º 2
0
    def test_part_gibberish(self):
        """Check that partially garbled samples score a chaos ratio strictly above 0.4."""

        # Multi-line sample: the raw newlines inside the triple-quoted literal are
        # part of the fixture. The probe gets an explicit giveup_threshold of 0.5.
        # NOTE(review): the leading "[email protected]" looks like an e-mail
        # obfuscation artifact rather than original fixture text - confirm upstream.
        self.assertGreater(
            ProbeChaos("""[email protected] ุชุฑุฌู…
ู€ู€ุฉ ู‡ู€ุดู€ู€ู€ุงู…
 ุงู„ู€ู‚ู€ู€ู€ู€ู„ุงูRadoZ ุชู€ู€ู€ุนู€ู€ู€ู€ุฏูŠู€ู€ู„ ุงู„ู€ู€ู€ุชู€ู€ู€ู€ูˆู‚ู€ู€ูŠู€ู€ู€ู€ุช ู…
ู€ู€ู€ู† ู‚ู€ู€ุจู€ู€ู„""",
                       giveup_threshold=0.5).ratio, 0.4)

        # Single-line sample probed with the default give-up threshold.
        self.assertGreater(ProbeChaos("锌褉械锌芯写邪胁邪褌械谢褟屑懈 锌芯褝褌芯 ").ratio, 0.4)
Exemplo n.º 3
0
    def test_not_gibberish(self):
        """Well-formed samples must not register any chaos."""
        cjk_text = '典肇乎庚辰年十二月廿一,及己丑年二月十九,收各方語言二百五十,合逾七百萬目;二十大卷佔八成,單英文卷亦過二百萬。悉文乃天下有志共筆而成;有意助之,幾網路、隨纂作,大典茁焉。'
        arabic_text = 'العقلية , التنويم المغناطيسي و / أو الاقتراح'
        latin_arabic_text = "RadoZ تـــعــــديــل الـــتــــوقــيــــت مـــن قــبــل"

        # The first sample only has to stay at or below zero ...
        cjk_ratio = ProbeChaos(cjk_text).ratio
        self.assertLessEqual(cjk_ratio, 0.)

        # ... whereas the next two must be exactly zero.
        arabic_ratio = ProbeChaos(arabic_text).ratio
        self.assertEqual(arabic_ratio, 0.)

        latin_arabic_ratio = ProbeChaos(latin_arabic_text).ratio
        self.assertEqual(latin_arabic_ratio, 0.)
Exemplo n.º 4
0
 def chaos_secondary_pass(self):
     """
     Measure chaos a second time, probing the whole decoded content.
     :return: Chaos ratio reported by ProbeChaos for str(self)
     :rtype: float
     """
     full_text = str(self)
     full_probe = ProbeChaos(full_text)
     return full_probe.ratio
Exemplo n.º 5
0
    def test_subtle_gibberish(self):
        """Lightly garbled samples must keep their chaos ratio within the asserted bounds."""
        turkish_question = "Cehennemin Sava■þ²s²'da kim?"
        symbol_soup = "´Á¥½³ø§i --  ±i®Ìºû, ³¯·Ø©v"
        istanbul_sentence = "ïstanbul, T■rkiye'nin en kalabal»k, iktisadi ve k■lt■rel aÓ»dan en —nemli"
        french_markup = "<i>Parce que Óa, c'est la vÕritable histoire de la rencontre avec votre Tante Robin.</i>"

        # Each assertion builds its own probe, so no state is shared between checks.
        self.assertLessEqual(ProbeChaos(turkish_question).ratio, 0.5)
        self.assertGreaterEqual(ProbeChaos(turkish_question).ratio, 0.)

        self.assertGreater(ProbeChaos(symbol_soup).ratio, 0.)
        self.assertLessEqual(ProbeChaos(symbol_soup).ratio, 0.5)

        self.assertGreater(ProbeChaos(istanbul_sentence).ratio, 0.)
        self.assertLessEqual(ProbeChaos(istanbul_sentence).ratio, 0.5)

        self.assertLessEqual(ProbeChaos(french_markup).ratio, 0.5)
Exemplo n.º 6
0
    def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20):
        """
        Take a sequence of bytes that could potentially be decoded to str and discard all obvious non supported
        charset encoding.
        Will test input like this (with steps=4 & chunk_size=4) --> [####     ####     ####     ####]
        :param bytes sequences: Actual sequence of bytes to analyse
        :param int steps: Number of steps
        :param int chunk_size: Size to extract and analyse in each step
        :param float threshold: Maximum amount of chaos allowed on first pass
        :return: List of potential matches; empty when sequences is empty. A single match is returned as soon
                 as a byte order mark is recognised or plain ascii decodes with a zero median chaos.
        :rtype: CharsetNormalizerMatches
        """
        # Only major and minor are parsed: the patch level may carry a pre-release
        # suffix (e.g. '0rc1') that int() rejects.
        py_major, py_minor = (int(el) for el in python_version_tuple()[:2])
        py_need_sort = (py_major, py_minor) < (3, 6)

        supported = sorted(aliases.items()) if py_need_sort else aliases.items()

        tested = set()
        matches = list()

        maximum_length = len(sequences)

        # Nothing to probe: return no match instead of failing on a zero stride below.
        if maximum_length == 0:
            return CharsetNormalizerMatches(matches)

        if maximum_length <= chunk_size:
            chunk_size = maximum_length
            steps = 1

        # Distance between two chunk starts; kept >= 1 because range() rejects a zero
        # step, which happened whenever the input was shorter than `steps`.
        stride = max(1, maximum_length // steps)

        for _alias, p in supported:

            if p in tested:
                continue

            tested.add(p)

            bom_available = False
            bom_len = 0

            if p in BYTE_ORDER_MARK:
                marks = BYTE_ORDER_MARK[p]

                # An encoding may declare a single mark (bytes) or several (list).
                if isinstance(marks, bytes):
                    marks = [marks]
                elif not isinstance(marks, list):
                    marks = []

                for mark in marks:
                    if sequences.startswith(mark):
                        bom_available = True
                        bom_len = len(mark)
                        break

            # The byte order mark, when present, is not part of the text itself.
            payload = sequences[bom_len:] if bom_available else sequences

            try:
                # Strict decoding of the whole payload: cheap way to rule the encoding out.
                str(payload, encoding=p)
            except (UnicodeDecodeError, LookupError):
                continue

            r_ = range(bom_len, maximum_length, stride)

            measures = [
                ProbeChaos(
                    str(sequences[i:i + chunk_size], encoding=p, errors='ignore'),
                    giveup_threshold=threshold
                )
                for i in r_
            ]
            ratios = [el.ratio for el in measures]
            nb_gave_up = sum(1 for el in measures if el.gave_up is True or el.ratio >= threshold)

            if ratios:
                chaos_means = statistics.mean(ratios)
                chaos_median = statistics.median(ratios)
            else:
                # Input made of a byte order mark only: no chunk to measure, hence no chaos.
                chaos_means = chaos_median = 0.

            if (len(r_) >= 4 and nb_gave_up > len(r_) / 4) or chaos_median > threshold:
                continue

            # Sum the per-chunk unicode range counters into one table for this encoding.
            encountered_unicode_range_occurrences = dict()

            for el in measures:
                for u_name, u_occ in el.encountered_unicode_range_occurrences.items():
                    encountered_unicode_range_occurrences[u_name] = \
                        encountered_unicode_range_occurrences.get(u_name, 0) + u_occ

            cnm = CharsetNormalizerMatch(
                payload,
                p,
                chaos_means,
                encountered_unicode_range_occurrences,
                bom_available
            )

            # Candidates sharing a fingerprint are attached to the first match that has it;
            # otherwise the candidate becomes a top-level match.
            fingerprint = cnm.fingerprint

            for known in matches:
                if known.fingerprint == fingerprint:
                    known.submatch.append(cnm)
                    break
            else:
                matches.append(cnm)

            if (p == 'ascii' and chaos_median == 0.) or bom_available is True:
                # Return the candidate just evaluated. `matches[-1]` would be an unrelated
                # entry whenever the candidate was filed as a submatch above.
                return CharsetNormalizerMatches([cnm])

        return CharsetNormalizerMatches(matches)
Exemplo n.º 7
0
    def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.09):
        """
        Take a sequence of bytes that could potentially be decoded to str and discard all obvious non supported
        charset encoding.
        :param bytearray sequences: Actual sequence of bytes to analyse
        :param int steps: Number of steps
        :param int chunk_size: Size to extract and analyse in each step
        :param float threshold: Maximum amount of chaos allowed on first pass
        :return: List of potential matches; empty when sequences is empty
        :rtype: CharsetNormalizerMatches
        """
        # Only major and minor are parsed: the patch level may carry a pre-release
        # suffix (e.g. '0rc1') that int() rejects.
        py_major, py_minor = (int(el) for el in python_version_tuple()[:2])
        py_need_sort = (py_major, py_minor) < (3, 6)

        supported = sorted(
            aliases.items()) if py_need_sort else aliases.items()
        tested = set()
        working = dict()

        maximum_length = len(sequences)

        # Distance between two chunk starts; kept >= 1 because range() rejects a zero
        # step, which happened whenever the input was shorter than `steps`.
        stride = max(1, maximum_length // steps)

        for _alias, p in supported:

            if p in tested:
                continue

            tested.add(p)

            try:
                # Strict decoding of the whole input: cheap way to rule the encoding out.
                str(sequences, encoding=p)
            except (UnicodeDecodeError, LookupError):
                continue

            chaos_measures = list()
            ranges_encountered_t = dict()
            too_chaotic = False

            for i in range(0, maximum_length, stride):

                chunk = sequences[i:i + chunk_size]
                decoded = str(chunk, encoding=p, errors='ignore')

                probe_chaos = ProbeChaos(decoded)
                chaos_measure = probe_chaos.ratio

                for u_name, u_occ in probe_chaos.encountered_unicode_range_occurrences.items():
                    ranges_encountered_t[u_name] = ranges_encountered_t.get(u_name, 0) + u_occ

                # A single chunk above the threshold disqualifies the encoding.
                if chaos_measure > threshold:
                    too_chaotic = True
                    break

                chaos_measures.append(chaos_measure)

            # Rejected encodings (or ones with no measured chunk) are never registered, so
            # the ascii shortcut below cannot look up a missing entry.
            if too_chaotic or not chaos_measures:
                continue

            working[p] = {
                'ratio': statistics.mean(chaos_measures),
                'ranges': ranges_encountered_t,
            }

            # Plain ascii without any chaos: no need to try the remaining encodings.
            if p == 'ascii' and working[p]['ratio'] == 0.:
                break

        candidates = sorted(working) if py_need_sort else working

        return CharsetNormalizerMatches([
            CharsetNormalizerMatch(sequences, enc, working[enc]['ratio'],
                                   working[enc]['ranges'])
            for enc in candidates
            if working[enc]['ratio'] <= threshold
        ])
Exemplo n.º 8
0
 def chaos_secondary_pass(self):
     """
     Probe chaos a second time, this time over the whole decoded content.
     :return: The ProbeChaos instance built from str(self)
     """
     whole_text = str(self)
     return ProbeChaos(whole_text)