Exemplo n.º 1
0
def alpha_unicode_split(decoded_sequence: str) -> List[str]:
    """
    Split a decoded text sequence into layers, one per alphabet / unicode range.

    Only alphabetic characters are kept, and they are lower-cased. A character
    joins the first already-discovered layer for which
    is_suspiciously_successive_range(layer_range, character_range) is False;
    when no such layer exists it is stored under its own range.
    Ex. a text containing English/Latin with a bit of Hebrew will return two
    items in the resulting list; one containing the latin letters and the
    other hebrew.

    :param decoded_sequence: Already decoded text to analyse.
    :return: One lower-cased string per discovered layer.
    """
    # Fragments are collected in lists and joined once at the end instead of
    # growing a string with += for every character.
    layers = {}  # type: Dict[str, List[str]]

    for character in decoded_sequence:
        if not character.isalpha():
            continue

        character_range = unicode_range(character)  # type: Optional[str]

        # unicode_range is handled as Optional elsewhere in this file; a
        # character without a known range cannot be assigned to any layer and
        # must not create a layer keyed by None.
        if character_range is None:
            continue

        # Default to the character's own range, unless a compatible layer
        # has already been discovered.
        layer_target_range = character_range  # type: str

        for discovered_range in layers:
            if is_suspiciously_successive_range(discovered_range,
                                                character_range) is False:
                layer_target_range = discovered_range
                break

        layers.setdefault(layer_target_range, []).append(character.lower())

    return ["".join(fragments) for fragments in layers.values()]
Exemplo n.º 2
0
def encoding_unicode_range(iana_name: str) -> List[str]:
    """
    List the primary unicode ranges reachable through a single byte code page.

    Every byte value from 48 up to 254 is decoded on its own with the
    incremental decoder of the given encoding (decoding errors are ignored),
    and the unicode range of each decoded result is collected unless
    is_unicode_range_secondary reports it as secondary.

    :param iana_name: Codec name, used to import ``encodings.<iana_name>``.
    :return: The collected range names, sorted.
    :raises IOError: When the encoding is a multi-byte code page.
    """
    if is_multi_byte_encoding(iana_name):
        raise IOError("Function not supported on multi-byte code page")

    codec_module = importlib.import_module('encodings.{}'.format(iana_name))
    decoder_factory = codec_module.IncrementalDecoder  # type: ignore
    incremental = decoder_factory(errors="ignore")  # type: IncrementalDecoder

    primary_ranges = set()  # type: Set[str]

    # NOTE(review): the upper bound is exclusive, so byte 0xFF is never
    # probed -- confirm this is intended.
    for byte_value in range(48, 255):
        decoded = incremental.decode(bytes([byte_value]))  # type: str

        # Nothing was produced for this byte (e.g. ignored decoding error).
        if not decoded:
            continue

        found_range = unicode_range(decoded)  # type: Optional[str]

        if found_range is None:
            continue

        if is_unicode_range_secondary(found_range) is False:
            primary_ranges.add(found_range)

    ordered_ranges = list(primary_ranges)
    ordered_ranges.sort()
    return ordered_ranges
Exemplo n.º 3
0
 def alphabets(self) -> List[str]:
     """
     Return the sorted unicode ranges found in ``str(self)``.

     The result is computed once and cached in ``self._unicode_ranges``;
     later calls return the cached list.

     :return: Sorted list of distinct unicode range names.
     """
     if self._unicode_ranges is not None:
         return self._unicode_ranges
     detected_ranges = set()  # type: Set[str]
     for character in str(self):
         character_range = unicode_range(character)  # type: Optional[str]
         # unicode_range is handled as Optional elsewhere in this file; a
         # None entry would make sorted() raise TypeError when compared
         # against the str range names.
         if character_range is not None:
             detected_ranges.add(character_range)
     self._unicode_ranges = sorted(detected_ranges)
     return self._unicode_ranges
Exemplo n.º 4
0
    def feed(self, character: str) -> None:
        """
        Consume one character and update the successive-range counters.

        Every call increments ``self._character_count``. Whitespace and
        punctuation reset the tracking of the last printable character. Any
        other character is compared, by unicode range, with the previously
        stored one; when is_suspiciously_successive_range flags the pair,
        ``self._suspicious_successive_range_count`` is incremented.

        :param character: The character to consume.
        """
        self._character_count += 1

        # Separators are tested first so that they are never stored as the
        # "last printable" character. Otherwise the second separator of a
        # run such as ", " would be remembered and its range compared with
        # the next letter.
        if character.isspace() or is_punctuation(character):
            self._last_printable_seen = None
            return

        # First printable character after a reset: nothing to compare with.
        if self._last_printable_seen is None:
            self._last_printable_seen = character
            return

        unicode_range_a = unicode_range(
            self._last_printable_seen)  # type: Optional[str]
        unicode_range_b = unicode_range(character)  # type: Optional[str]

        if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
            self._suspicious_successive_range_count += 1

        self._last_printable_seen = character
Exemplo n.º 5
0
def unicode_range_languages(primary_range: str) -> List[str]:
    """
    List the languages that may be written with the given unicode range.

    A language from FREQUENCIES is selected as soon as one of its characters
    has a unicode range equal to ``primary_range``.

    :param primary_range: Unicode range name to look for.
    :return: Matching languages, in FREQUENCIES iteration order.
    """
    return [
        candidate
        for candidate, alphabet in FREQUENCIES.items()
        # any() stops at the first matching character.
        if any(unicode_range(letter) == primary_range for letter in alphabet)
    ]