Пример #1
0
    def _probe(self):
        """
        Go through every word of self._words and append the ones that look
        suspicious to self._suspicious.

        A word is kept as suspicious as soon as one of these checks succeeds
        (evaluated in this order, later checks are skipped once one matched):
          - too many characters from secondary unicode ranges (more than a
            quarter of the word, or more than half of it when every range
            name of the word contains 'Latin');
          - more than 40 % of punctuation while several ranges are present;
          - not Latin based and more than 40 % of accents;
          - not Latin based, longer than 10 characters and more than 30 % of
            lonely ranges.
        """
        for word in self._words:
            word_length = len(word)
            ranges = UnicodeRangeIdentify.classification(word)

            latin_only = all(
                ['Latin' in range_name for range_name in ranges.keys()])
            several_ranges = len(ranges) > 1

            # Secondary ranges are only counted when the word mixes ranges.
            secondary_count = 0
            if several_ranges:
                secondary_count = sum(
                    occurrences
                    for range_name, occurrences in ranges.items()
                    if UnicodeRangeIdentify.is_range_secondary(range_name) is True
                )

            # Latin based words tolerate twice as many secondary characters.
            if latin_only:
                suspicious = secondary_count > word_length // 2
            else:
                suspicious = secondary_count > word_length // 4

            if not suspicious:
                suspicious = UnicodeRangeIdentify.part_punc(word) > 0.4 and several_ranges

            if not suspicious and not latin_only:
                suspicious = UnicodeRangeIdentify.part_accent(word) > 0.4
                if not suspicious and word_length > 10:
                    suspicious = UnicodeRangeIdentify.part_lonely_range(word) > 0.3

            if suspicious:
                self._suspicious.append(word)
Пример #2
0
    def test_list_by_range(self):
        """Letters given to list_by_range must come back grouped by range name."""
        letters = ['a', 'b', 'c', 'd', 'e', 'é', 'ù', '역', '사', 'π', 'ο', 'υ']

        expected = {
            'Basic Latin': ['a', 'b', 'c', 'd', 'e', 'é', 'ù'],
            'Hangul Syllables': ['역', '사'],
            'Greek and Coptic': ['π', 'ο', 'υ'],
        }

        self.assertEqual(
            expected,
            UnicodeRangeIdentify.list_by_range(letters)
        )
    def test_should_be_accented(self):
        """Each of these single letters must be reported as accentuated."""
        # 'è' is listed twice on purpose: the sequence of checks is kept as is.
        for letter in ('é', 'è', 'è', 'à', 'À', 'Ù', 'ç'):
            self.assertTrue(
                UnicodeRangeIdentify.is_accentuated(letter)
            )
 def test_should_throw(self):
     """is_accentuated must raise IOError when given more than one character."""
     for sample in ('àé', 'aé', 'aa'):
         with self.assertRaises(IOError):
             UnicodeRangeIdentify.is_accentuated(sample)
    def alphabet_coverage(self):
        """
        Tell, for every unicode range present in self.letters, whether that
        range is considered covered.

        A range is covered when the share of its letters that are also in
        self.covered_letters is at least COHERENCE_ALPHABET_COVERED_IF.

        :return: Unicode range name associated to its covered state
        :rtype: dict
        """
        grouped = UnicodeRangeIdentify.list_by_range(self.letters)

        return {
            range_name: sum(
                1 for letter in members if letter in self.covered_letters
            ) / len(members) >= COHERENCE_ALPHABET_COVERED_IF
            for range_name, members in grouped.items()
        }
Пример #6
0
 def ratio(self):
     """
     Return the chaos ratio of the initial string: the sum of the chaos
     counters divided by the length of the string.
     Closest to 1. means that the initial string is considered as chaotic,
     Closest to 0. means that the initial string SEEMS NOT chaotic.
     The sum is not clamped, so the result may exceed 1.
     An empty string yields 0. instead of raising ZeroDivisionError.
     :return: Ratio as floating number
     :rtype: float
     """
     str_len = len(self._string)

     # Nothing has been read, so nothing can be chaotic (and nothing to divide by).
     if str_len == 0:
         return 0.

     # Upper case accentuated letters only weigh in when less than half of
     # the letters read so far are unaccented.
     r_ = 0
     if self.total_letter_encountered > 0 and self.total_unaccented_letter_encountered / self.total_letter_encountered < 0.5:
         r_ = self.total_upper_accent_encountered

     z_ = UnicodeRangeIdentify.unravel_suspicious_ranges(
         str_len, self.encountered_unicode_range_occurrences)

     return (r_ + self.successive_upper_lower + self.successive_accent +
             self.successive_different_unicode_range +
             self.not_encountered_white_space +
             self.unprintable + z_ + self._unravel_cjc_suspicious()) / str_len
Пример #7
0
    def _probe(self):
        """
        Read self._string one character at a time and update the chaos
        counters stored on the instance (unprintable, successive_accent,
        successive_upper_lower, successive_different_unicode_range,
        not_encountered_white_space, ...).

        The scan is aborted, and self.gave_up set to True, when self.ratio
        already reaches self._threshold once half of the string has been read.

        :return: Nothing, the counters are updated in place on self
        """

        # c__ turns True once the halfway check below has been performed.
        c__ = False
        # upper_lower_m remembers that the previous character already was a
        # case alternation; only the second one in a row is counted.
        upper_lower_m = False

        for c, i_ in zip(self._string, range(0, len(self._string))):

            # state_ tells whether at least half of the string has been read.
            # It is no longer refreshed once the halfway check is done.
            if not c__:
                state_ = (i_ / len(self._string) >= 0.5)

            # If the chaos ratio already reaches self._threshold after reading 50 %, give up.
            if not c__ and state_ and self.ratio >= self._threshold:
                self.gave_up = True
                break
            elif c__ is False and state_:
                c__ = True

            self.total_letter_encountered += 1

            # Non printable characters are booked like white spaces. Except for
            # line feed, tab and carriage return, they also add 2 to
            # self.unprintable unless is_cjk() or is_punc() accepts them.
            if not c.isprintable():
                if c not in ['\n', '\t', '\r']:
                    if not UnicodeRangeIdentify.is_cjk(
                            c) and not UnicodeRangeIdentify.is_punc(c):
                        self.unprintable += 2

                self.encountered_white_space += 1
                self.not_encountered_white_space = 0
                self.not_encountered_white_space_reset += 1
                continue

            # A white space resets the "no white space" run and becomes the
            # previous printable letter.
            if c.isspace():
                self.encountered_white_space += 1
                self.not_encountered_white_space = 0
                self.not_encountered_white_space_reset += 1
                self.previous_printable_letter = c
                continue

            # The run without white space only grows while fewer than two
            # resets have happened.
            if self.not_encountered_white_space_reset < 2:
                self.not_encountered_white_space += 1

            # The very first character reaching this point only initialises
            # previous_printable_letter; there is nothing to compare it with.
            if self.previous_printable_letter is None:
                self.previous_printable_letter = c
                continue

            is_accent = UnicodeRangeIdentify.is_accentuated(c)
            u_name = UnicodeRangeIdentify.find_letter_type(c)

            is_upper = c.isupper()
            is_lower = c.islower() if not is_upper else False
            is_alpha = c.isalpha()
            is_latin = UnicodeRangeIdentify.is_latin(c)

            # First time this unicode range is seen: register it.
            if u_name is not None and u_name not in self.encountered_unicode_range:
                self.encountered_unicode_range_occurrences[u_name] = 0
                self.encountered_unicode_range.add(u_name)

            # Two accentuated characters in a row.
            if is_accent and UnicodeRangeIdentify.is_accentuated(
                    self.previous_printable_letter):
                self.successive_accent += 2

            if is_lower:
                self.total_lower_letter_encountered += 1

            # Upper case accentuated letters are tracked separately, and once
            # more when they directly follow an alphabetic character.
            if is_upper and is_accent:
                self.total_upper_accent_encountered += 1
                if self.previous_printable_letter.isalpha():
                    self.total_upper_accent_encountered_inner += 1
            elif not is_accent and is_alpha:
                self.total_unaccented_letter_encountered += 1

            if u_name is not None:
                self.encountered_unicode_range_occurrences[u_name] += 1

                is_punc = UnicodeRangeIdentify.is_punc(c)

                # Punctuation is booked like a white space and skips the rest
                # of the iteration (previous letter and range stay untouched).
                if is_punc is True:
                    self.encountered_punc_sign += 1
                    self.encountered_white_space += 1
                    self.not_encountered_white_space = 0
                    self.not_encountered_white_space_reset += 1
                    continue

                # Case alternation: the first one only arms upper_lower_m, the
                # next consecutive one is counted and disarms it again.
                if (is_lower and self.previous_printable_letter.isupper()) or (
                        is_upper and self.previous_printable_letter.islower()):
                    if not upper_lower_m:
                        upper_lower_m = True
                    else:
                        self.successive_upper_lower += 1
                        upper_lower_m = False
                else:
                    upper_lower_m = False

                # NOTE(review): for Latin characters the previous range is
                # replaced by the current one before the check below, so the
                # helper receives the same range twice - confirm this is the
                # intended way to exempt Latin characters.
                if is_latin:
                    self.previous_encountered_unicode_range = u_name
                    self.previous_printable_letter = c

                # Count a suspicious change of unicode range, unless the
                # previous printable letter is punctuation.
                if self.previous_encountered_unicode_range is not None and UnicodeRangeIdentify.is_suspiciously_successive_range(
                        u_name,
                        self.previous_encountered_unicode_range) is True:

                    if not UnicodeRangeIdentify.is_punc(
                            self.previous_printable_letter):
                        self.successive_different_unicode_range += 1

            self.previous_encountered_unicode_range = u_name
            self.previous_printable_letter = c

        # Strings shorter than 50 characters are not penalised for missing white spaces.
        if len(self._string) < 50:
            self.not_encountered_white_space = 0
        # Fewer than three counted case alternations are ignored.
        if self.successive_upper_lower < 3:
            self.successive_upper_lower = 0
Пример #8
0
    def _probe(self):
        """
        Read self._string one character at a time and update the chaos
        counters stored on the instance (unprintable, successive_accent,
        successive_upper_lower, successive_different_unicode_range,
        not_encountered_white_space, ...).

        The scan is aborted, and self.gave_up set to True, when self.ratio is
        already 0.3 or more once half of the string has been read.

        :return: Nothing, the counters are updated in place on self
        """

        # c__ turns True once the halfway check below has been performed.
        c__ = False
        str_len = len(self._string)

        for i_, c in enumerate(self._string):
            # state_ tells whether at least half of the string has been read.
            state_ = (i_ / str_len >= 0.5)
            if not c__ and state_ and self.ratio >= 0.3:
                self.gave_up = True
                break
            elif c__ is False and state_:
                c__ = True

            self.total_letter_encountered += 1

            # Non printable characters are booked like white spaces.
            if not c.isprintable():
                if c not in ['\n', '\t', '\r']:
                    u_name = UnicodeRangeIdentify.find_letter_type(c)
                    # find_letter_type may give None (the rest of this method
                    # checks for it); such a character belongs to no known
                    # range and is therefore never exempted by its range.
                    range_is_exempt = u_name is not None and (
                        'CJK' in u_name or 'General Punctuation' in u_name)  # CJK have their own white spaces
                    # Code point 160 (no-break space) is never counted.
                    if not range_is_exempt and ord(c) != 160:
                        self.unprintable += 2

                self.encountered_white_space += 1
                self.not_encountered_white_space = 0
                self.not_encountered_white_space_reset += 1
                continue

            # A white space resets the "no white space" run and becomes the
            # previous printable letter.
            if c.isspace():
                self.encountered_white_space += 1
                self.not_encountered_white_space = 0
                self.not_encountered_white_space_reset += 1
                self.previous_printable_letter = c
                continue

            # The run without white space only grows while fewer than two
            # resets have happened.
            if self.not_encountered_white_space_reset < 2:
                self.not_encountered_white_space += 1

            # The very first character reaching this point only initialises
            # previous_printable_letter; there is nothing to compare it with.
            if self.previous_printable_letter is None:
                self.previous_printable_letter = c
                continue

            is_accent = UnicodeRangeIdentify.is_accentuated(c)
            u_name = UnicodeRangeIdentify.find_letter_type(c)
            u_name_lower = u_name.lower() if u_name is not None else None

            is_upper = c.isupper()
            is_lower = c.islower() if not is_upper else False
            is_alpha = c.isalpha()

            # First time this unicode range is seen: register it.
            if u_name is not None and u_name not in self.encountered_unicode_range:
                self.encountered_unicode_range_occurrences[u_name] = 0
                self.encountered_unicode_range.add(u_name)

            # Two accentuated characters in a row.
            if is_accent and UnicodeRangeIdentify.is_accentuated(
                    self.previous_printable_letter):
                self.successive_accent += 2

            if is_lower:
                self.total_lower_letter_encountered += 1

            # Upper case accentuated letters are tracked separately, and once
            # more when they directly follow an alphabetic character.
            if is_upper and is_accent:
                self.total_upper_accent_encountered += 1
                if self.previous_printable_letter.isalpha():
                    self.total_upper_accent_encountered_inner += 1
            elif not is_accent and is_alpha:
                self.total_unaccented_letter_encountered += 1

            if u_name is not None:
                self.encountered_unicode_range_occurrences[u_name] += 1

                # Punctuation-like ranges are booked like white spaces.
                if 'symbols and punctuation' in u_name_lower or 'general punctuation' in u_name_lower or 'halfwidth and fullwidth forms' in u_name_lower:
                    self.encountered_white_space += 1
                    self.not_encountered_white_space = 0
                    self.not_encountered_white_space_reset += 1

                # Latin and punctuation-like characters stop here: they only
                # become the previous printable letter, the previous range is
                # left untouched.
                if 'latin' in u_name_lower or 'halfwidth and fullwidth forms' in u_name_lower or 'symbols and punctuation' in u_name_lower or 'general punctuation' in u_name_lower:
                    self.previous_printable_letter = c
                    continue
                elif (self.previous_printable_letter.isupper() and c.islower()
                      ) or (self.previous_printable_letter.islower()
                            and c.isupper()):
                    self.successive_upper_lower += 1

                # Count a change of unicode range, unless the previous range
                # is Latin or punctuation-like.
                if u_name != self.previous_encountered_unicode_range and self.previous_encountered_unicode_range is not None:
                    # Lower-cased so that the markers below match the same way
                    # they do against u_name_lower above.
                    k__ = self.previous_encountered_unicode_range.lower()
                    if 'latin' not in k__ and \
                            'halfwidth and fullwidth forms' not in k__ and \
                            'symbols and punctuation' not in k__ and \
                            'general punctuation' not in k__:
                        self.successive_different_unicode_range += 1

            self.previous_encountered_unicode_range = u_name
            self.previous_printable_letter = c