示例#1
0
def decompose_nfkd(text: Any) -> Optional[str]:
    """Apply unicode NFKD (compatibility decomposition) normalisation.

    Compatibility variants of characters are folded into their standard
    representation, and composed characters are split so that the base
    character and each of its diacritics become separate codepoints.

    Returns ``None`` when ``is_text`` rejects the input.
    """
    if is_text(text):
        return unicodedata.normalize('NFKD', text)
    return None
示例#2
0
def latinize_text(text: Optional[str], ascii: bool = False) -> Optional[str]:
    """Convert the given text to the latin script by transliteration.

    With ``ascii`` set, the transliterator is built from ``ASCII_SCRIPT``;
    otherwise it is built from the ``"Any-Latin"`` script. Each
    transliterator is created with ``make_trans`` on first use and then
    cached as an attribute on this function.

    ``None``, values rejected by ``is_text`` and empty strings are handed
    back unchanged.
    """
    if text is None or not is_text(text) or len(text) == 0:
        return text

    # One cached transliterator per mode, stored on the function object.
    slot = "_ascii" if ascii else "_tr"
    if not hasattr(latinize_text, slot):
        script = ASCII_SCRIPT if slot == "_ascii" else "Any-Latin"
        setattr(latinize_text, slot, make_trans(script))
    transliterate = getattr(latinize_text, slot)
    return transliterate(text)
示例#3
0
def category_replace(
        text: Any,
        replacements: Categories = UNICODE_CATEGORIES) -> Optional[str]:
    """Replace or drop characters according to their unicode category.

    The text is first decomposed with ``decompose_nfkd``. Each character's
    unicode category is then looked up in ``replacements``:

    * a category mapped to a string substitutes that string,
    * a category mapped to ``None`` drops the character,
    * a category that is not listed keeps the character as it is.

    This allows whole classes of non-text characters (punctuation,
    whitespace, marks, diacritics) to be handled without enumerating them.
    Returns ``None`` when the decomposed input is not text.
    """
    decomposed = decompose_nfkd(text)
    if not is_text(decomposed):
        return None
    substituted = (
        replacements.get(unicodedata.category(char), char)
        for char in decomposed
    )
    # A ``None`` replacement means the character is removed entirely.
    return ''.join(piece for piece in substituted if piece is not None)
示例#4
0
def ascii_text(text: Optional[str]) -> Optional[str]:
    """Transliterate the text and guarantee that the result is pure ASCII.

    The text goes through ``latinize_text`` in ASCII mode; anything that is
    still outside the ASCII range afterwards is discarded. Returns ``None``
    when the transliterated value is ``None`` or not text.
    """
    latinized = latinize_text(text, ascii=True)
    if latinized is not None and is_text(latinized):
        # Characters the transliterator left outside ASCII are dropped.
        return latinized.encode("ascii", "ignore").decode("ascii")
    return None
示例#5
0
def collapse_spaces(text: Any) -> Optional[str]:
    """Replace newlines, tabs and runs of spaces with single spaces.

    Every match of ``COLLAPSE_RE`` is substituted with ``WS``, after which
    ``WS`` is stripped from both ends of the result. Returns ``None`` when
    the input is not text.
    """
    if is_text(text):
        collapsed = COLLAPSE_RE.sub(WS, text)
        return collapsed.strip(WS)
    return None
示例#6
0
def remove_byte_order_mark(text: Any) -> Optional[str]:
    """Remove a BOM from the beginning of the text.

    Every match of ``BOM_RE`` is replaced with the empty string; the
    pattern is defined elsewhere and is presumably anchored to the start
    of the text (verify against its definition).

    :param text: the value to clean; any type is accepted.
    :returns: the text without the byte order mark, or ``None`` when
        ``is_text`` rejects the input.
    """
    if not is_text(text):
        return None
    return BOM_RE.sub('', text)
示例#7
0
def remove_unsafe_chars(text: Any) -> Optional[str]:
    """Remove unsafe unicode characters from a piece of text.

    Every match of ``UNSAFE_RE`` is replaced with the empty string; which
    characters count as unsafe is determined by that pattern, which is
    defined elsewhere.

    :param text: the value to clean; any type is accepted.
    :returns: the text without the unsafe characters, or ``None`` when
        ``is_text`` rejects the input.
    """
    if not is_text(text):
        return None
    return UNSAFE_RE.sub('', text)
示例#8
0
def strip_quotes(text: Any) -> Optional[str]:
    """Strip the double or single quotes that surround a string.

    Each match of ``QUOTES_RE`` is replaced by its first capture group.
    Returns ``None`` when the input is not text.
    """
    return QUOTES_RE.sub('\\1', text) if is_text(text) else None
示例#9
0
def compose_nfkc(text: Any) -> Optional[str]:
    """Apply unicode NFKC (compatibility composition) normalisation.

    Returns ``None`` when ``is_text`` rejects the input.
    """
    if is_text(text):
        return unicodedata.normalize('NFKC', text)
    return None