def normalize(text, lowercase=True, collapse=True, latinize=False,
              ascii=False, decompose=False,
              replace_categories=UNICODE_CATEGORIES):
    """The main normalization function for text.

    This will take a string and apply a set of transformations to it so
    that it can be processed more easily afterwards.

    Arguments:

    * ``lowercase``: not very mysterious.
    * ``collapse``: replace multiple whitespace-like characters with a
      single whitespace. This is especially useful with category
      replacement which can lead to a lot of whitespace.
    * ``decompose``: apply a unicode normalization (NFKD) to separate
      simple characters and their diacritics.
    * ``ascii``: a stricter form of transliteration that leaves only
      ASCII characters (implies transliteration; takes precedence over
      ``latinize``).
    * ``latinize``: perform unicode-based transliteration, e.g. of
      cyrillic or CJK scripts into latin.
    * ``replace_categories``: This will perform a replacement of whole
      classes of unicode characters (e.g. symbols, marks, numbers) with
      a given character. It is used to replace any non-text elements of
      the input string.

    Returns the normalized text, or ``None`` when the input is not a
    string or could not be transliterated.
    """
    if not isinstance(text, six.string_types):
        return

    # TODO: Python 3?
    # On Python 2, byte strings must be decoded to unicode first; the
    # encoding is sniffed with a UTF-8 fallback.
    if six.PY2 and not isinstance(text, six.text_type):
        encoding = guess_encoding(text, 'utf-8')
        text = text.decode(encoding)

    if lowercase:
        # Yeah I made a Python package for this.
        text = text.lower()

    if decompose:
        text = decompose_nfkd(text)

    if ascii:
        # A stricter form of transliteration that leaves only ASCII
        # characters.
        text = ascii_text(text)
    elif latinize:
        # Perform unicode-based transliteration, e.g. of cyrillic
        # or CJK scripts into latin.
        text = latinize_text(text)

    # FIX: the transliteration helpers can return None for input that
    # cannot be represented; guard before category replacement (the
    # newer normalize() implementation performs this same check).
    if text is None:
        return

    # Perform unicode category-based character replacement. This is
    # used to filter out whole classes of characters, such as symbols,
    # punctuation, or whitespace-like characters.
    text = category_replace(text, replace_categories)

    if collapse:
        # Remove consecutive whitespace.
        text = collapse_spaces(text)
    return text
def reactions_for_message_content(
    content: str,
    emoji_map: Dict[str, discord.Emoji],
    reaction_options: Dict[str, Dict[str, Any]],
) -> List[discord.Emoji]:
    """Find emoji names mentioned in a message and return the matching
    reactions, ordered by where each name first matched in the text.

    ``emoji_map`` maps emoji names to Emoji objects; only names present
    in BOTH ``emoji_map`` and ``reaction_options`` are searched for.
    Each name's options dict may carry ``"nobefore"`` / ``"noafter"``
    lists: words that, when found immediately before/after a match
    (separated by a single space), suppress that match.
    """
    # extract salt 😎
    # Undo look-alike obfuscation: "ꜞ" stands in for "i", and U+2006
    # (six-per-em space) for a regular space.
    searchtext = content.replace("ꜞ", "i").replace("\u2006", " ")
    # 6-bit distortion 🎸
    # Transliterate to ASCII and lowercase so matching is
    # case/diacritic-insensitive.
    searchtext = ascii_text(searchtext).lower()
    log.debug("searchtext transformed %r -> %r", content, searchtext)
    # Maps match position -> emoji name; keyed by index so results can
    # be ordered by first occurrence below.
    reactions_by_index = {}
    for ename in set(emoji_map).intersection(set(reaction_options)):
        start = 0
        overrides = reaction_options.get(ename)
        # Scan every occurrence of this name in the text.
        while True:
            try:
                idx = searchtext.index(ename, start)
            except ValueError:
                break
            start = idx + len(ename)  # resume scanning after this match
            override_found = False
            if overrides:
                nobefore = overrides.get("nobefore", [])
                noafter = overrides.get("noafter", [])
                # Suppress the match if an excluded word plus a single
                # space directly precedes it.
                for override in nobefore:
                    first = idx - len(override) - 1
                    if first >= 0:
                        searchbefore = searchtext[first:idx]
                        if searchbefore == override + " ":
                            override_found = True
                            break
                # Likewise for a single space plus an excluded word
                # directly after the match.
                if not override_found:
                    for override in noafter:
                        last = start + len(override) + 1
                        if last <= len(searchtext):
                            searchafter = searchtext[start:last]
                            if searchafter == " " + override:
                                override_found = True
                                break
            # Skip matches that sit inside a :custom_emoji: token.
            inside_an_emoji = (idx > 0 and searchtext[idx - 1] == ":"
                               and start < len(searchtext)
                               and searchtext[start] == ":")
            if inside_an_emoji:
                override_found = True
            # Skip matches embedded in a longer word (alphanumeric
            # character touching either side).
            if idx > 0 and searchtext[idx - 1].isalnum():
                override_found = True
            elif searchtext[start:start + 1].isalnum():
                override_found = True
            if not override_found:
                reactions_by_index[idx] = ename
    # Emit reactions in order of first appearance, deduplicated.
    reactions = []
    for idx, ename in sorted(reactions_by_index.items()):
        emoji = emoji_map[ename]
        if emoji not in reactions:
            reactions.append(emoji)
    return reactions
def normalize(text: Any, lowercase: bool = True, collapse: bool = True,
              latinize: bool = False, ascii: bool = False,
              encoding_default: Encoding = DEFAULT_ENCODING,
              encoding: Optional[str] = None,
              replace_categories: Categories = UNICODE_CATEGORIES) -> Optional[str]:
    """The main normalization function for text.

    This will take a string and apply a set of transformations to it so
    that it can be processed more easily afterwards.

    Arguments:

    * ``lowercase``: not very mysterious.
    * ``collapse``: replace multiple whitespace-like characters with a
      single whitespace. This is especially useful with category
      replacement which can lead to a lot of whitespace.
    * ``ascii``: a stricter form of transliteration that leaves only
      ASCII characters (takes precedence over ``latinize``).
    * ``latinize``: perform unicode-based transliteration, e.g. of
      cyrillic or CJK scripts into latin.
    * ``encoding``: if the input is a byte string, decode it using this
      encoding.
    * ``encoding_default``: fallback encoding used when ``encoding`` is
      not given and detection fails.
    * ``replace_categories``: This will perform a replacement of whole
      classes of unicode characters (e.g. symbols, marks, numbers) with
      a given character. It is used to replace any non-text elements of
      the input string.

    Returns the normalized text, or ``None`` when the input cannot be
    turned into a string or the transliteration yields nothing.
    """
    text = stringify(text, encoding_default=encoding_default,
                     encoding=encoding)
    if text is None:
        return

    if lowercase:
        # Yeah I made a Python package for this.
        text = text.lower()

    if ascii:
        # A stricter form of transliteration that leaves only ASCII
        # characters.
        text = ascii_text(text)
    elif latinize:
        # Perform unicode-based transliteration, e.g. of cyricllic
        # or CJK scripts into latin.
        text = latinize_text(text)

    # Transliteration may fail and return None.
    if text is None:
        return

    # Perform unicode category-based character replacement. This is
    # used to filter out whole classes of characters, such as symbols,
    # punctuation, or whitespace-like characters.
    text = category_replace(text, replace_categories)

    if collapse:
        # Remove consecutive whitespace.
        text = collapse_spaces(text)
    return text
def _safe_name(file_name: Optional[str], sep: str) -> Optional[str]:
    """Convert the file name to ASCII and normalize the string.

    Returns ``None`` when the input cannot be converted to a non-empty
    safe name; otherwise the normalized name with whitespace replaced
    by ``sep``.
    """
    file_name = stringify(file_name)
    if file_name is None:
        return None
    file_name = ascii_text(file_name)
    # FIX: ascii_text() can return None for untransliterable input;
    # guard before category replacement (mirrors the check performed
    # in normalize()).
    if file_name is None:
        return None
    file_name = category_replace(file_name, UNICODE_CATEGORIES)
    file_name = collapse_spaces(file_name)
    # collapse_spaces() may also yield None or an empty string.
    if not file_name:
        return None
    return file_name.replace(WS, sep)