Python cltk_normalize示例

编程语言: Python

命名空间/包名称: cltk.alphabet.text_normalization

方法/功能: cltk_normalize

hotexamples.com的示例: 4

Python cltk_normalize - 已找到4个示例。以下是从开源项目中提取的、评分最高的 cltk.alphabet.text_normalization.cltk_normalize 真实 Python 示例。您可以为示例评分，以帮助我们提高示例质量。

示例#1

显示文件

文件： lat.py 项目： free-variation/cltk

def normalize_lat(text: str) -> str:
    """Apply the default Latin normalization to ``text``.

    The input is handed to ``cltk_normalize`` and the value it returns is
    passed back to the caller unchanged.

    TODO: Add parameters for stripping macrons, other unlikely chars. Perhaps use ``remove_non_ascii()``.
    """
    return cltk_normalize(text=text)

示例#2

显示文件

文件： lat.py 项目： diyclassics/cltk

def normalize_lat(
    text: str,
    drop_accents: bool = False,
    drop_macrons: bool = False,
    jv_replacement: bool = False,
    ligature_replacement: bool = False,
) -> str:
    """Run the default Latin normalization pipeline over ``text``.

    The text always goes through ``cltk_normalize`` and then
    ``remove_odd_punct``. The remaining steps are opt-in and, when enabled,
    run in this fixed order: macron removal, accent removal, j/v
    replacement, ligature replacement.

    Args:
        text: The Latin text to normalize.
        drop_accents: If true, apply ``remove_accents`` to the text.
        drop_macrons: If true, apply ``remove_macrons`` to the text.
        jv_replacement: If true, apply ``JV_REPLACER.replace`` to the text.
        ligature_replacement: If true, apply ``LIGATURE_REPLACER.replace``
            to the text.

    Returns:
        The normalized text.

    >>> text = "canō Īuliī suspensám quăm aegérrume ĭndignu îs óccidentem frúges Julius Caesar. In vino veritas. mæd prœil"
    >>> normalize_lat(text)
    'canō Īuliī suspensám quăm aegérrume ĭndignu îs óccidentem frúges Julius Caesar. In vino veritas. mæd prœil'

    >>> normalize_lat(text, drop_accents=True)
    'canō Īuliī suspensam quăm aegerrume ĭndignu is óccidentem frúges Julius Caesar. In vino veritas. mæd prœil'

    >>> normalize_lat(text, drop_accents=True, drop_macrons=True)
    'cano Iulii suspensam quăm aegerrume ĭndignu is óccidentem frúges Julius Caesar. In vino veritas. mæd prœil'

    >>> normalize_lat(text, drop_accents=True, drop_macrons=True, jv_replacement=True)
    'cano Iulii suspensam quăm aegerrume ĭndignu is óccidentem frúges Iulius Caesar. In uino ueritas. mæd prœil'

    >>> normalize_lat(text, drop_accents=True, drop_macrons=True, jv_replacement=True, ligature_replacement=True)
    'cano Iulii suspensam quăm aegerrume ĭndignu is óccidentem frúges Iulius Caesar. In uino ueritas. maed proeil'

    """
    # Mandatory steps: base normalization followed by odd-punctuation removal.
    # NOTE: splitting of trailing/leading punctuation (``split_trailing_punct``
    # / ``split_leading_punct``) is currently disabled and not applied here.
    normalized = remove_odd_punct(text=cltk_normalize(text=text))

    # Optional steps; each one consumes the output of the previous step.
    if drop_macrons:
        normalized = remove_macrons(normalized)
    if drop_accents:
        normalized = remove_accents(normalized)
    if jv_replacement:
        normalized = JV_REPLACER.replace(normalized)
    if ligature_replacement:
        normalized = LIGATURE_REPLACER.replace(normalized)

    return normalized

示例#3

显示文件

def normalize_grc(text: str) -> str:
    """Apply the default Greek normalization to ``text``.

    The text is first passed through ``cltk_normalize``; that result is then
    fed to ``tonos_oxia_converter``, whose output is returned.
    """
    return tonos_oxia_converter(text=cltk_normalize(text=text))

示例#4

显示文件

def normalize_grc(text: str) -> str:
    """Apply the default Greek normalization to ``text``.

    Three steps run in sequence, each consuming the previous step's output:
    ``tonos_oxia_converter``, then ``cltk_normalize``, then
    ``remove_odd_punct``. The output of the last step is returned.
    """
    converted: str = tonos_oxia_converter(text=text)
    normalized = cltk_normalize(text=converted)
    return remove_odd_punct(text=normalized)