def alphabets(self) -> List[str]:
    """
    Return the sorted unicode range names detected in the text of this object.

    The list is computed once from ``str(self)`` and stored in
    ``self._unicode_ranges``; subsequent calls return that stored list as-is.
    Characters for which ``unicode_range`` yields a falsy value are ignored.
    """
    if self._unicode_ranges is not None:
        return self._unicode_ranges

    detected_ranges = set()  # type: Set[str]

    for character in str(self):
        detected_range = unicode_range(character)  # type: Optional[str]

        # Reuse the range that was just resolved rather than looking it up twice.
        if detected_range:
            detected_ranges.add(detected_range)

    # sorted() accepts any iterable, no intermediate list is required.
    self._unicode_ranges = sorted(detected_ranges)

    return self._unicode_ranges
def encoding_unicode_range(iana_name: str) -> List[str]:
    """
    List the unicode ranges that a single byte code page can decode to.

    Each byte value from 48 up to 254 (the upper bound of the range is
    exclusive) is decoded on its own with the ``encodings.<iana_name>``
    incremental decoder, decoding errors being ignored. The unicode range of
    every non-empty result is collected unless ``is_unicode_range_secondary``
    reports something other than ``False`` for it.

    :param iana_name: Codec name, used as the sub-module name under ``encodings``.
    :return: Sorted list of the collected range names, without duplicates.
    :raises IOError: If ``is_multi_byte_encoding`` is truthy for ``iana_name``.
    """
    if is_multi_byte_encoding(iana_name):
        raise IOError("Function not supported on multi-byte code page")

    codec_module = importlib.import_module('encodings.{}'.format(iana_name))
    decoder_factory = codec_module.IncrementalDecoder  # type: ignore
    incremental_decoder = decoder_factory(errors="ignore")

    ranges_found = set()  # type: Set[str]

    for byte_value in range(48, 255):
        decoded = incremental_decoder.decode(bytes([byte_value]))  # type: str

        # Nothing came out for this byte (e.g. undefined in the code page).
        if not decoded:
            continue

        range_name = unicode_range(decoded)  # type: Optional[str]

        if range_name is None:
            continue

        if is_unicode_range_secondary(range_name) is False:
            ranges_found.add(range_name)

    ordered_ranges = list(ranges_found)
    ordered_ranges.sort()

    return ordered_ranges
def alpha_unicode_split(decoded_sequence: str) -> List[str]:
    """
    Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
    Ex. a text containing English/Latin with a bit of Hebrew will return two items in the resulting list;
    One containing the latin letters and the other hebrew.

    Only alphabetic characters are kept, and they are lower-cased. A character
    joins the first already discovered layer for which
    ``is_suspiciously_successive_range`` returns ``False``; otherwise it is
    filed under its own unicode range. Alphabetic characters whose unicode
    range cannot be resolved (``None``) are skipped.

    :param decoded_sequence: Text to split.
    :return: One string per layer, in order of first appearance.
    """
    # Letters are gathered in lists and joined once at the end, which avoids
    # repeated string concatenation on long inputs.
    layers = {}  # type: Dict[str, List[str]]

    for character in decoded_sequence:
        if not character.isalpha():
            continue

        character_range = unicode_range(character)  # type: Optional[str]

        # unicode_range is handled as Optional elsewhere in this file; without
        # this guard a None would end up being used as a layer key.
        if character_range is None:
            continue

        layer_target_range = None  # type: Optional[str]

        for discovered_range in layers:
            if is_suspiciously_successive_range(discovered_range, character_range) is False:
                layer_target_range = discovered_range
                break

        if layer_target_range is None:
            layer_target_range = character_range

        layers.setdefault(layer_target_range, []).append(character.lower())

    return ["".join(letters) for letters in layers.values()]
def feed(self, character: str) -> None:
    """
    Consume one character and update the successive-range statistics.

    The character counter is always incremented. When no previous character
    is held, the given one is simply remembered. A whitespace or punctuation
    character clears the remembered one. Otherwise the unicode ranges of the
    remembered and the given character are compared, the suspicious counter
    is incremented when ``is_suspiciously_successive_range`` is truthy, and
    the given character becomes the remembered one.
    """
    self._character_count += 1

    previous = self._last_printable_seen

    if previous is None:
        # Nothing to compare against yet.
        self._last_printable_seen = character
    elif character.isspace() or is_punctuation(character):
        # Separator: the next character starts a new comparison.
        self._last_printable_seen = None
    else:
        previous_range = unicode_range(previous)  # type: Optional[str]
        current_range = unicode_range(character)  # type: Optional[str]

        if is_suspiciously_successive_range(previous_range, current_range):
            self._suspicious_successive_range_count += 1

        self._last_printable_seen = character
def unicode_range_languages(primary_range: str) -> List[str]:
    """
    List the languages of FREQUENCIES that use the given unicode range.

    A language is selected as soon as one of its characters resolves, through
    ``unicode_range``, to ``primary_range``. Languages are returned in the
    iteration order of FREQUENCIES, each at most once.
    """
    return [
        language
        for language, characters in FREQUENCIES.items()
        # any() stops at the first matching character.
        if any(unicode_range(character) == primary_range for character in characters)
    ]