Exemplo n.º 1
0
def normalize_lat(text: str) -> str:
    """Apply the default normalization for Latin text.

    At present this only delegates to ``cltk_normalize``.

    TODO: Add parameters for stripping macrons and other unlikely chars.
    Perhaps use ``remove_non_ascii()``.

    Args:
        text: The Latin string to normalize.

    Returns:
        The value returned by ``cltk_normalize`` for ``text``.
    """
    return cltk_normalize(text=text)
Exemplo n.º 2
0
def normalize_lat(
    text: str,
    drop_accents: bool = False,
    drop_macrons: bool = False,
    jv_replacement: bool = False,
    ligature_replacement: bool = False,
) -> str:
    """Apply the default normalization for Latin text, plus optional steps.

    The input always goes through ``cltk_normalize`` and then
    ``remove_odd_punct``. Each optional step is off by default and, when
    enabled, runs in this fixed order: macrons, accents, j/v replacement,
    ligature replacement.

    Args:
        text: The Latin string to normalize.
        drop_accents: If true, pass the text through ``remove_accents``.
        drop_macrons: If true, pass the text through ``remove_macrons``.
        jv_replacement: If true, apply ``JV_REPLACER.replace`` (in the
            examples below, ``Julius`` becomes ``Iulius`` and ``vino``
            becomes ``uino``).
        ligature_replacement: If true, apply ``LIGATURE_REPLACER.replace``
            (in the examples below, ``mæd`` becomes ``maed``).

    Returns:
        The normalized string.

    >>> text = "canō Īuliī suspensám quăm aegérrume ĭndignu îs óccidentem frúges Julius Caesar. In vino veritas. mæd prœil"
    >>> normalize_lat(text)
    'canō Īuliī suspensám quăm aegérrume ĭndignu îs óccidentem frúges Julius Caesar. In vino veritas. mæd prœil'

    >>> normalize_lat(text, drop_accents=True)
    'canō Īuliī suspensam quăm aegerrume ĭndignu is óccidentem frúges Julius Caesar. In vino veritas. mæd prœil'

    >>> normalize_lat(text, drop_accents=True, drop_macrons=True)
    'cano Iulii suspensam quăm aegerrume ĭndignu is óccidentem frúges Julius Caesar. In vino veritas. mæd prœil'

    >>> normalize_lat(text, drop_accents=True, drop_macrons=True, jv_replacement=True)
    'cano Iulii suspensam quăm aegerrume ĭndignu is óccidentem frúges Iulius Caesar. In uino ueritas. mæd prœil'

    >>> normalize_lat(text, drop_accents=True, drop_macrons=True, jv_replacement=True, ligature_replacement=True)
    'cano Iulii suspensam quăm aegerrume ĭndignu is óccidentem frúges Iulius Caesar. In uino ueritas. maed proeil'

    """
    # Mandatory passes: generic normalization first, then punctuation cleanup.
    # NOTE: split_trailing_punct / split_leading_punct are currently disabled
    # and are deliberately not applied here.
    result = remove_odd_punct(text=cltk_normalize(text=text))

    # Optional passes; the order is significant and must not be rearranged.
    if drop_macrons:
        result = remove_macrons(result)
    if drop_accents:
        result = remove_accents(result)
    if jv_replacement:
        result = JV_REPLACER.replace(result)
    if ligature_replacement:
        result = LIGATURE_REPLACER.replace(result)

    return result
Exemplo n.º 3
0
def normalize_grc(text: str) -> str:
    """Apply the default normalization for Greek text.

    The text is first passed through ``cltk_normalize``; that result is then
    handed to ``tonos_oxia_converter``.

    Args:
        text: The Greek string to normalize.

    Returns:
        The value returned by ``tonos_oxia_converter`` for the normalized
        text.
    """
    normalized = cltk_normalize(text=text)
    return tonos_oxia_converter(text=normalized)
Exemplo n.º 4
0
def normalize_grc(text: str) -> str:
    """Apply the default normalization for Greek text.

    Three steps run in sequence: ``tonos_oxia_converter`` on the raw input,
    then ``cltk_normalize``, then ``remove_odd_punct``.

    Args:
        text: The Greek string to normalize.

    Returns:
        The value returned by ``remove_odd_punct`` after the first two steps.
    """
    # Tonos/oxia conversion happens before generic normalization here.
    converted = tonos_oxia_converter(text=text)
    return remove_odd_punct(text=cltk_normalize(text=converted))