예제 #1
0
def slugify(text, sep='-'):
    """Generate a simple slug from ``text``, joining words with ``sep``.

    Returns ``None`` when ``stringify`` or ``normalize`` yields ``None``.
    """
    value = stringify(text)
    if value is None:
        return None
    # Turn separators already present in the input into whitespace so they
    # pass through normalization like any other word boundary.
    normalized = normalize(value.replace(sep, WS), ascii=True)
    if normalized is None:
        return None
    # Whatever whitespace is left after normalization becomes the separator.
    return normalized.replace(WS, sep)
예제 #2
0
def normalize(text: Any,
              lowercase: bool = True,
              collapse: bool = True,
              latinize: bool = False,
              ascii: bool = False,
              encoding_default: Encoding = DEFAULT_ENCODING,
              encoding: Optional[str] = None,
              replace_categories: Categories = UNICODE_CATEGORIES
              ) -> Optional[str]:
    """The main normalization function for text.

    This will take a value, convert it to a string and apply a set of
    transformations to it so that it can be processed more easily
    afterwards. Arguments:

    * ``text``: the value to normalize; it is first passed through
      ``stringify``.
    * ``lowercase``: lowercase the text.
    * ``collapse``: replace multiple whitespace-like characters with a
      single whitespace. This is especially useful with category replacement
      which can lead to a lot of whitespace.
    * ``latinize``: perform unicode-based transliteration, e.g. of cyrillic
      or CJK scripts into latin. Ignored when ``ascii`` is set.
    * ``ascii``: a stricter form of transliteration that leaves only ASCII
      characters. Takes precedence over ``latinize``.
    * ``encoding_default``, ``encoding``: forwarded unchanged to
      ``stringify``.
    * ``replace_categories``: This will perform a replacement of whole
      classes of unicode characters (e.g. symbols, marks, numbers) with a
      given character. It is used to replace any non-text elements of the
      input string.

    Returns the normalized string, or ``None`` if ``stringify`` or the
    transliteration step produced ``None``.
    """
    text = stringify(text,
                     encoding_default=encoding_default,
                     encoding=encoding)
    if text is None:
        return None

    if lowercase:
        text = text.lower()

    # Only one transliteration is applied; ``ascii`` wins over ``latinize``.
    if ascii:
        # A stricter form of transliteration that leaves only ASCII
        # characters.
        text = ascii_text(text)
    elif latinize:
        # Perform unicode-based transliteration, e.g. of cyrillic
        # or CJK scripts into latin.
        text = latinize_text(text)

    # Transliteration may yield None; stop before the string helpers below.
    if text is None:
        return None

    # Perform unicode category-based character replacement. This is
    # used to filter out whole classes of characters, such as symbols,
    # punctuation, or whitespace-like characters.
    text = category_replace(text, replace_categories)

    if collapse:
        # Remove consecutive whitespace.
        text = collapse_spaces(text)
    return text
예제 #3
0
파일: paths.py 프로젝트: zanachka/normality
def _safe_name(file_name: Optional[str], sep: str) -> Optional[str]:
    """Turn a file name into an ASCII, normalized string joined by ``sep``.

    Returns ``None`` if the name cannot be stringified or if nothing is
    left once it has been cleaned up.
    """
    name = stringify(file_name)
    if name is None:
        return None
    # Transliterate to ASCII, replace unwanted unicode categories, then
    # squeeze runs of whitespace.
    cleaned = collapse_spaces(
        category_replace(ascii_text(name), UNICODE_CATEGORIES)
    )
    if not cleaned:
        return None
    return cleaned.replace(WS, sep)
예제 #4
0
def slugify(value: Any, sep: str = "-") -> Optional[str]:
    """Generate a slug from ``value``.

    Slugs are pure ASCII lowercase strings that can be used in URLs and
    other places where a name has to be machine-safe. Returns ``None``
    when the value cannot be stringified, transliteration yields ``None``,
    or no characters remain after filtering.
    """
    raw = stringify(value)
    if raw is None:
        return None
    # Treat separators already present in the input as plain whitespace.
    spaced = raw.replace(sep, WS)
    # Category replacement runs before transliteration because it gives
    # better results on special characters.
    replaced = category_replace(spaced, SLUG_CATEGORIES)
    latin = latinize_text(replaced, ascii=True)
    if latin is None:
        return None
    lowered = latin.lower()
    # Drop every character that is not explicitly allowed in a slug.
    kept = "".join(ch for ch in lowered if ch in VALID_CHARS)
    slug = collapse_spaces(kept)
    if not slug:
        return None
    return slug.replace(WS, sep)