def normalize_lat(text: str) -> str:
    """Apply the default Latin normalization to ``text``.

    The input is passed through ``cltk_normalize`` and the result is
    returned unchanged otherwise.

    TODO: Add parameters for stripping macrons, other unlikely chars.
    Perhaps use ``remove_non_ascii()``.

    Args:
        text: The Latin text to normalize.

    Returns:
        The value returned by ``cltk_normalize`` for ``text``.
    """
    return cltk_normalize(text=text)
def normalize_lat(
    text: str,
    drop_accents: bool = False,
    drop_macrons: bool = False,
    jv_replacement: bool = False,
    ligature_replacement: bool = False,
) -> str:
    """Apply the default Latin normalization, plus any requested extras.

    ``cltk_normalize`` and ``remove_odd_punct`` always run, in that order.
    Each keyword flag then switches on one further step; the optional
    steps run in a fixed order (macrons, accents, j/v, ligatures)
    regardless of the order in which the flags are passed.

    Args:
        text: The Latin text to normalize.
        drop_accents: If true, pass the text through ``remove_accents``.
        drop_macrons: If true, pass the text through ``remove_macrons``.
        jv_replacement: If true, apply ``JV_REPLACER.replace``.
        ligature_replacement: If true, apply ``LIGATURE_REPLACER.replace``.

    Returns:
        The normalized text.

    >>> text = "canō Īuliī suspensám quăm aegérrume ĭndignu îs óccidentem frúges Julius Caesar. In vino veritas. mæd prœil"
    >>> normalize_lat(text)
    'canō Īuliī suspensám quăm aegérrume ĭndignu îs óccidentem frúges Julius Caesar. In vino veritas. mæd prœil'
    >>> normalize_lat(text, drop_accents=True)
    'canō Īuliī suspensam quăm aegerrume ĭndignu is óccidentem frúges Julius Caesar. In vino veritas. mæd prœil'
    >>> normalize_lat(text, drop_accents=True, drop_macrons=True)
    'cano Iulii suspensam quăm aegerrume ĭndignu is óccidentem frúges Julius Caesar. In vino veritas. mæd prœil'
    >>> normalize_lat(text, drop_accents=True, drop_macrons=True, jv_replacement=True)
    'cano Iulii suspensam quăm aegerrume ĭndignu is óccidentem frúges Iulius Caesar. In uino ueritas. mæd prœil'
    >>> normalize_lat(text, drop_accents=True, drop_macrons=True, jv_replacement=True, ligature_replacement=True)
    'cano Iulii suspensam quăm aegerrume ĭndignu is óccidentem frúges Iulius Caesar. In uino ueritas. maed proeil'
    """
    # Unconditional base pipeline.
    # NOTE: split_trailing_punct / split_leading_punct are currently not
    # part of this pipeline.
    result: str = remove_odd_punct(text=cltk_normalize(text=text))

    # Optional steps; macrons are handled before accents.
    if drop_macrons:
        result = remove_macrons(result)
    if drop_accents:
        result = remove_accents(result)
    if jv_replacement:
        result = JV_REPLACER.replace(result)
    if ligature_replacement:
        result = LIGATURE_REPLACER.replace(result)

    return result
def normalize_grc(text: str) -> str:
    """Apply the default Greek normalization to ``text``.

    The text is first passed through ``cltk_normalize``; that result is
    then handed to ``tonos_oxia_converter``.

    Args:
        text: The Greek text to normalize.

    Returns:
        The output of ``tonos_oxia_converter`` applied to the
        ``cltk_normalize``-d text.
    """
    normalized: str = cltk_normalize(text=text)
    converted: str = tonos_oxia_converter(text=normalized)
    return converted
def normalize_grc(text: str) -> str:
    """Apply the default Greek normalization to ``text``.

    Three steps run in sequence, each consuming the previous result:
    ``tonos_oxia_converter``, then ``cltk_normalize``, then
    ``remove_odd_punct``.

    Args:
        text: The Greek text to normalize.

    Returns:
        The text after all three steps have been applied.
    """
    # The tonos/oxia conversion runs on the raw input, before
    # cltk_normalize.
    converted: str = tonos_oxia_converter(text=text)
    normalized = cltk_normalize(text=converted)
    return remove_odd_punct(text=normalized)