def __init__(self, deterministic: bool = True):
    """Build the verbalizer FST for cardinal tokens.

    Exposes ``optional_sign``, ``integer``, ``graph_masc``, ``graph_fem`` and
    ``fst`` as instance attributes.

    Args:
        deterministic: when False, the apocope-related alternative renderings
            are added to the graph as well.
    """
    super().__init__(name="cardinal", kind="verbalize", deterministic=deterministic)

    # Zero or one sign field; `negative: "true" ` is rewritten to "menos ".
    self.optional_sign = pynini.closure(pynini.cross("negative: \"true\" ", "menos "), 0, 1)

    # Quoted value: drop the opening ` "` and the closing `"`, keep the
    # non-empty run of non-quote characters between them.
    self.integer = pynutil.delete(" \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")

    # Full token body: optional sign, then the `integer:` field with its name removed.
    masculine = self.optional_sign + (pynutil.delete("integer:") + self.integer)
    feminine = shift_cardinal_gender(masculine)

    self.graph_masc = pynini.optimize(masculine)
    self.graph_fem = pynini.optimize(feminine)

    # Both genders are accepted (choice of gender will be random).
    verbalizer = masculine | feminine

    if not deterministic:
        # Alternate renderings when apocope is omitted (i.e. cardinal stands alone).
        without_apocope = strip_cardinal_apocope(masculine)
        # "una" will drop to "un" in unique contexts.
        with_fem_apocope = add_cardinal_apocope_fem(feminine)
        verbalizer = verbalizer | without_apocope | with_fem_apocope

    self.fst = self.delete_tokens(verbalizer).optimize()
def generator_main(exporter: grm.Exporter, token_type: pynini.TokenType):
    """FSTs for reading normalization of abjad / alphabet script languages.

    For every language in `u.LANGS`, the FST returned by `_open_visual` is
    composed with the FST compiled from that language's `reading_norm.tsv`,
    optimized, and exported under the upper-cased language name.

    Args:
        exporter: Exporter receiving one FST per language.
        token_type: Token type used as the pynini default while building and
            passed on to `_open_visual`.
    """
    with pynini.default_token_type(token_type):
        sigma = u.sigma_from_common_data_files()
        for lang in u.LANGS:
            visual_norm = _open_visual(lang, token_type)
            reading_norm = rule.fst_from_rule_file(
                u.LANG_DIR / lang / 'reading_norm.tsv', sigma)
            exporter[lang.upper()] = pynini.optimize(visual_norm @ reading_norm)
def fst_from_rules(rules: RuleSet, sigma: pynini.Fst) -> pynini.Fst:
    """Builds the rewrite FST for a set of rewrite rules.

    The rules are split by `partition_unordered`; each partition is compiled
    into an optimized string-map FST, and the resulting list is handed to
    `ur.RewriteAndComposeFsts` together with `sigma`.

    Args:
      rules: String rules representing a set of rewrites.
      sigma: Fst to consider the complete alphabet for CDRewrites.

    Returns:
      The rewrite FST for the given rule set.
    """
    partitions = partition_unordered(rules)
    # Lazy stage: each string map is built right before it is optimized.
    string_maps = (pynini.string_map(partition) for partition in partitions)
    optimized_maps = [pynini.optimize(mapping) for mapping in string_maps]
    return ur.RewriteAndComposeFsts(optimized_maps, sigma)
def generator_main(exporter: grm.Exporter):
    """FSTs for language-agnostic reversible romanization of abjad/alphabets.

    Exports:
      FROM_ARAB: NFC rules, then visual normalization rules, then the
        romanization rules, composed in that order and optimized.
      TO_ARAB: closure of the inverted romanization string mapping, optimized.
    """

    def compile_rules(rule_file):
        # All rule files here are compiled over the byte alphabet.
        return rule.fst_from_rule_file(rule_file, byte.BYTE)

    # The NFC transducer built here differs from the standalone FST transducer
    # in that it allows letters that are not abjad / alphabet.
    nfc = compile_rules(u.LANG_DIR / 'nfc.tsv')

    # Language-agnostic visual normalization.
    visual_norm = compile_rules(u.LANG_DIR / 'common' / 'visual_norm.tsv')

    # Romanisation. In the Latin direction NFC and visual normalization are
    # applied first; no visual normalization is required in the opposite
    # direction.
    roman_mapping_file = u.LANG_DIR / 'reversible_roman.tsv'
    roman = compile_rules(roman_mapping_file)
    exporter['FROM_ARAB'] = pynini.optimize(nfc @ visual_norm @ roman)

    # Transforming Latin to native is simpler: invert the raw string mapping
    # and allow it to repeat.
    to_native = pynini.invert(f.StringFile(roman_mapping_file)).star
    exporter['TO_ARAB'] = to_native.optimize()
def Rewrite(rule: pynini.FstLike,
            sigma: pynini.Fst = byte.BYTE,
            left: pynini.FstLike = "",
            right: pynini.FstLike = "") -> pynini.Fst:
    """Compiles an optimized context-dependent rewrite FST.

    Args:
      rule: The rewrite to apply.
      sigma: Alphabet FST; its closure (`sigma.star`) is passed to
        `pynini.cdrewrite`. Defaults to `byte.BYTE`.
      left: Left context of the rewrite; empty by default.
      right: Right context of the rewrite; empty by default.

    Returns:
      The optimized result of `pynini.cdrewrite`.
    """
    rewrite = pynini.cdrewrite(rule, left, right, sigma.star)
    return pynini.optimize(rewrite)
def __init__(self, deterministic: bool = True):
    """Build the verbalizer FST for fraction tokens.

    Sets ``self.graph_masc``, ``self.graph_fem``, ``self.graph`` (their union)
    and ``self.fst`` (``self.graph`` wrapped by ``self.delete_tokens`` and
    optimized).

    Args:
        deterministic: when False, extra renderings are added: the
            "undécimo"/"duodécimo" exceptions, the feminine forms followed by
            " parte"/" partes", and the variants that go through
            ``merge_stem`` instead of ``merge_into_single_word``.
    """
    super().__init__(name="fraction", kind="verbalize", deterministic=deterministic)

    # Derivational strings append 'avo' as a suffix. Adding space for processing aid
    fraction_stem = pynutil.insert(" avo")
    plural = pynutil.insert("s")
    conjunction = pynutil.insert(" y ")

    # Field extractors: each one deletes the field name and the surrounding
    # quotes and keeps the quoted value.
    integer = (pynutil.delete("integer_part: \"") + strip_cardinal_apocope(pynini.closure(NEMO_NOT_QUOTE)) +
               pynutil.delete("\""))

    # The numerator "un" is handled separately from every other numerator.
    numerator_one = pynutil.delete("numerator: \"") + pynini.accep(
        "un") + pynutil.delete("\" ")
    numerator = (pynutil.delete("numerator: \"") +
                 pynini.difference(pynini.closure(NEMO_NOT_QUOTE), "un") +
                 pynutil.delete("\" "))

    # Three denominator shapes, told apart by the trailing
    # morphosyntactic_features tag: "add_root" (value gets " avo" appended),
    # "ordinal" (value kept as is), and no tag at all (cardinal).
    denominator_add_stem = pynutil.delete("denominator: \"") + (
        pynini.closure(NEMO_NOT_QUOTE) + fraction_stem +
        pynutil.delete("\" morphosyntactic_features: \"add_root\""))
    denominator_ordinal = pynutil.delete("denominator: \"") + (
        pynini.closure(NEMO_NOT_QUOTE) +
        pynutil.delete("\" morphosyntactic_features: \"ordinal\""))
    denominator_cardinal = pynutil.delete("denominator: \"") + (
        pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\""))

    denominator_singular = pynini.union(denominator_add_stem,
                                        denominator_ordinal)
    if not deterministic:
        # Occasional exceptions
        denominator_singular |= denominator_add_stem @ pynini.string_map(
            [("once avo", "undécimo"), ("doce avo", "duodécimo")])
    denominator_plural = denominator_singular + plural

    # Merging operations
    merge = pynini.cdrewrite(
        pynini.cross(" y ", "i"), "", "", NEMO_SIGMA
    )  # The denominator must be a single word, with the conjunction "y" replaced by i
    merge @= pynini.cdrewrite(delete_space, "",
                              pynini.difference(NEMO_CHAR, "parte"),
                              NEMO_SIGMA)

    # The merger can produce duplicate vowels. This is not allowed in orthography
    delete_duplicates = pynini.string_map([("aa", "a"),
                                           ("oo", "o")])  # Removes vowels
    delete_duplicates = pynini.cdrewrite(delete_duplicates, "", "",
                                         NEMO_SIGMA)

    # Applies `accents` inside a word (left context: a space or [BOS] followed
    # by non-space characters) that ends in avo/ava/ésimo/ésima.
    remove_accents = pynini.cdrewrite(
        accents,
        pynini.union(NEMO_SPACE, pynini.accep("[BOS]")) +
        pynini.closure(NEMO_NOT_SPACE),
        pynini.closure(NEMO_NOT_SPACE) +
        pynini.union("avo", "ava", "ésimo", "ésima"),
        NEMO_SIGMA,
    )

    merge_into_single_word = merge @ remove_accents @ delete_duplicates

    # Numerator, a single space, then the merged denominator (plural unless
    # the numerator is "un").
    fraction_default = numerator + delete_space + insert_space + (
        denominator_plural @ merge_into_single_word)
    fraction_with_one = (numerator_one + delete_space + insert_space +
                         (denominator_singular @ merge_into_single_word))

    # Untagged (cardinal) denominators are read as "<numerator> sobre <denominator>".
    fraction_with_cardinal = strip_cardinal_apocope(numerator | numerator_one)
    fraction_with_cardinal += (
        delete_space + pynutil.insert(" sobre ") +
        strip_cardinal_apocope(denominator_cardinal))

    if not deterministic:
        # There is an alternative rendering where ordinals act as adjectives for 'parte'. This requires use of the feminine
        # Other rules will manage use of "un" at end, so just worry about endings
        exceptions = pynini.string_map([("tercia", "tercera")])
        apply_exceptions = pynini.cdrewrite(exceptions, "", "", NEMO_SIGMA)
        vowel_change = pynini.cdrewrite(pynini.cross("o", "a"), "",
                                        pynini.accep("[EOS]"), NEMO_SIGMA)

        denominator_singular_fem = shift_cardinal_gender(
            denominator_singular) @ vowel_change @ apply_exceptions
        denominator_plural_fem = denominator_singular_fem + plural

        numerator_one_fem = shift_cardinal_gender(numerator_one)
        numerator_fem = shift_cardinal_gender(numerator)

        fraction_with_cardinal |= (
            (numerator_one_fem | numerator_fem) + delete_space +
            pynutil.insert(" sobre ") +
            shift_cardinal_gender(denominator_cardinal))

        # Still need to manage stems
        merge_stem = pynini.cdrewrite(
            delete_space, "", pynini.union("avo", "ava", "avos", "avas"),
            NEMO_SIGMA)  # For managing alternative spacing
        merge_stem @= remove_accents @ delete_duplicates

        fraction_with_one_fem = numerator_one_fem + delete_space + insert_space
        fraction_with_one_fem += pynini.union(
            denominator_singular_fem @ merge_stem,
            denominator_singular_fem @ merge_into_single_word)  # Both forms exists
        fraction_with_one_fem += pynutil.insert(" parte")
        fraction_with_one_fem @= pynini.cdrewrite(
            pynini.cross("una media", "media"), "", "",
            NEMO_SIGMA)  # "media" not "una media"

        fraction_default_fem = numerator_fem + delete_space + insert_space
        fraction_default_fem += pynini.union(
            denominator_plural_fem @ merge_stem,
            denominator_plural_fem @ merge_into_single_word)
        fraction_default_fem += pynutil.insert(" partes")

        # `@` binds tighter than `+`, so only the denominator is composed
        # with merge_stem in the two unparenthesized sums below.
        fraction_default |= (numerator + delete_space + insert_space +
                             denominator_plural @ merge_stem
                             )  # Case of no merger
        fraction_default |= fraction_default_fem

        fraction_with_one |= numerator_one + delete_space + insert_space + denominator_singular @ merge_stem
        fraction_with_one |= fraction_with_one_fem

    fraction_with_one @= pynini.cdrewrite(pynini.cross(
        "un medio", "medio"), "", "", NEMO_SIGMA)  # "medio" not "un medio"

    # The masculine graph is assembled here, before the feminine "medio"
    # variants below are added to fraction_default / fraction_with_one.
    fraction = fraction_with_one | fraction_default | fraction_with_cardinal
    # Optional integer part followed by an inserted " y ", then the fraction.
    graph_masc = pynini.closure(integer + delete_space + conjunction, 0,
                                1) + fraction

    # Manage cases of fem gender (only shows on integer except for "medio")
    integer_fem = shift_cardinal_gender(integer)
    fraction_default |= (
        shift_cardinal_gender(numerator) + delete_space + insert_space +
        (denominator_plural @ pynini.cross("medios", "medias")))
    # Here the numerator is deleted rather than verbalized, leaving just "media".
    fraction_with_one |= (
        pynutil.delete(numerator_one) + delete_space +
        (denominator_singular @ pynini.cross("medio", "media")))

    fraction_fem = fraction_with_one | fraction_default | fraction_with_cardinal
    graph_fem = pynini.closure(integer_fem + delete_space + conjunction, 0,
                               1) + fraction_fem

    self.graph_masc = pynini.optimize(graph_masc)
    self.graph_fem = pynini.optimize(graph_fem)

    self.graph = graph_masc | graph_fem

    delete_tokens = self.delete_tokens(self.graph)
    self.fst = delete_tokens.optimize()
def brahmic_to_iso(consonant_file: os.PathLike,
                   inherent_vowel_file: os.PathLike,
                   vowel_sign_file: os.PathLike,
                   vowel_file: os.PathLike,
                   vowel_length_sign_file: os.PathLike,
                   coda_file: os.PathLike,
                   dead_consonant_file: os.PathLike,
                   standalone_file: os.PathLike,
                   subjoined_consonant_file: os.PathLike,
                   virama_file: os.PathLike) -> pynini.Fst:
    """Creates an FST that transduces a Brahmic script to ISO 15919.

    Every argument is the path of a StringFile. In addition, the shared
    `common/symbol.tsv` under `u.SCRIPT_DIR` is always loaded.

    Args:
      consonant_file: Native--latin consonant mapping.
      inherent_vowel_file: The inherent vowel.
      vowel_sign_file: Native--latin vowel matra mapping.
      vowel_file: Native--latin independent vowel mapping.
      vowel_length_sign_file: Native--latin vowel length sign mapping.
      coda_file: Native--latin coda mapping.
      dead_consonant_file: Native--latin dead consonant mapping.
      standalone_file: Native--latin standalone string mapping.
      subjoined_consonant_file: Native--latin subjoined consonant mapping.
      virama_file: The virama for the script.

    Returns:
      Brahmic script to ISO FST.
    """
    # Load the mappings (same order as the parameters, then the common symbols).
    core_consonant = uf.StringFile(consonant_file)
    inherent_vowel = uf.StringFile(inherent_vowel_file)
    vowel_sign = uf.StringFile(vowel_sign_file)
    vowel = uf.StringFile(vowel_file)
    vowel_length_sign = uf.StringFile(vowel_length_sign_file)
    coda = uf.StringFile(coda_file)
    dead_consonant = uf.StringFile(dead_consonant_file)
    standalone = uf.StringFile(standalone_file)
    subjoined_consonant = uf.StringFile(subjoined_consonant_file)
    virama = uf.StringFile(virama_file)
    common_symbol = uf.StringFile(u.SCRIPT_DIR / 'common' / 'symbol.tsv')

    # Building blocks.
    add_inherent_vowel = pynutil.insert(inherent_vowel)
    add_dash = pynutil.insert('-')
    add_dot = pynutil.insert('.')
    drop_virama = pynutil.delete(virama)
    explicit_virama = pynini.cross(virama, '˘')
    # Empty string carrying weight 1; appended to the alternatives below that
    # were marked low priority.
    low_priority = pynini.accep('', weight=1)

    consonant = core_consonant + uf.QuesSafe(subjoined_consonant)
    # One or more independent vowels, each preceded by an inserted dot.
    dotted_vowels = (add_dot + vowel).plus

    regular_cases = [
        consonant + vowel_sign,
        consonant + add_inherent_vowel + low_priority,
        consonant + drop_virama + low_priority,
        vowel + low_priority,
        coda,
        dead_consonant,
        vowel_length_sign,
        standalone,
    ]
    rare_cases = [
        # Dangling vowel signs.
        add_dash + vowel_sign + (add_dot + vowel).star + low_priority,
        # Explicit virama elsewhere.
        explicit_virama + low_priority,
        # Joiners.
        common_symbol,
    ]
    # Independent vowel not as the first letter.
    non_initial_vowel_cases = [
        vowel + dotted_vowels + low_priority,
        consonant + vowel_sign + dotted_vowels,
        consonant + drop_virama + dotted_vowels,
        consonant + add_inherent_vowel + dotted_vowels,
    ]

    convert_to_iso = pynini.union(
        *regular_cases, *rare_cases, *non_initial_vowel_cases)
    return pynini.optimize(convert_to_iso.star)
onetoone_34 = pynini.transducer("ⲭ", "ch")
rule_onetoone_34 = pynini.cdrewrite(onetoone_34, "", "", sigma_star)
onetoone_35 = pynini.transducer("ϭ", "ky")
rule_onetoone_35 = pynini.cdrewrite(onetoone_35, "", "", sigma_star)

# cdrewrite cascade: every rule is composed left to right, so each rule
# operates on the output of the rules before it.
# The cascade previously ended in `rule_onetoone_34 @ rule_onetoone_34`, which
# applied the "ⲭ" rule twice and left `rule_onetoone_35` ("ϭ" -> "ky") unused.
# NOTE(review): rule_3 and rule_15..rule_17 are not part of the cascade --
# confirm that this is intentional.
cascade = pynini.optimize(
    rule_addwb_1 @ rule_addwb_2
    @ rule_1 @ rule_2 @ rule_4 @ rule_5 @ rule_6 @ rule_7 @ rule_8
    @ rule_9 @ rule_10 @ rule_11 @ rule_12 @ rule_13 @ rule_14
    @ rule_18 @ rule_19 @ rule_20 @ rule_21
    @ rule_onetoone_1 @ rule_onetoone_2 @ rule_onetoone_3
    @ rule_onetoone_4 @ rule_onetoone_5 @ rule_onetoone_6
    @ rule_onetoone_7 @ rule_onetoone_8 @ rule_onetoone_9
    @ rule_onetoone_10 @ rule_onetoone_11 @ rule_onetoone_12
    @ rule_onetoone_13 @ rule_onetoone_14 @ rule_onetoone_15
    @ rule_onetoone_16 @ rule_onetoone_17 @ rule_onetoone_18
    @ rule_onetoone_19 @ rule_onetoone_20 @ rule_onetoone_21
    @ rule_onetoone_22 @ rule_onetoone_23 @ rule_onetoone_24
    @ rule_onetoone_25 @ rule_onetoone_26 @ rule_onetoone_27
    @ rule_onetoone_28 @ rule_onetoone_29 @ rule_onetoone_30
    @ rule_onetoone_31 @ rule_onetoone_32 @ rule_onetoone_33
    @ rule_onetoone_34 @ rule_onetoone_35
    @ rule_removewb)


def translit(text):
    """Transliterate `text` by running it through the rewrite cascade.

    The input is lower-cased, composed with `cascade`, and the resulting FST
    is returned as a string.
    """
    lowered = text.lower()
    rewritten = lowered @ cascade
    return rewritten.string()


script, filename = argv
sigma.star)

# clean up rules
# Rewrites every inflClass / category / feat symbol with T(..., "") in any
# context. NOTE(review): T is defined outside this view; presumably it builds
# a transducer from its first argument to its second -- confirm.
cleanUp = pynini.cdrewrite(T(inflClass | category | feat, ""), "", "", sigma.star)

# double plurals in Breton
# Lexicon side: the lexicon passed through the number and diminutive rules.
newLex = (lexicon @ numRule @ dimRule).optimize()
# Realization side: diminutive realization, then plural realization, then clean-up.
grammar = (dimRealization @ plRealization @ cleanUp).optimize()
lexForms = (newLex @ grammar).optimize()

# Checking the program
# NOTE(review): the value of this expression is discarded; presumably it was
# meant to be displayed in an interactive session -- confirm.
pynini.optimize(pynini.project(lexForms, True))


def realizeNoun(m):
    """Return the string obtained by running `m` through `grammar`.

    `m` is composed with `grammar`, the result is projected and then
    stringified with the "utf8" token type.
    """
    x = (m @ grammar).optimize()
    # NOTE(review): the positional `True` presumably selects the output side
    # in the pynini version this script targets -- confirm.
    y = pynini.project(x, True)
    return y.stringify(token_type="utf8")


arrow = "\t-->\t"  # \t is the 'tab' character
# The two shorter labels carry an extra trailing tab before the arrow.
print("bag[sg]\t" + arrow + realizeNoun(lexicon + sg))
print("bag[pl]\t" + arrow + realizeNoun(lexicon + pl))
print("bag[sg,dim]" + arrow + realizeNoun(lexicon + sg + dim))
print("bag[pl,dim]" + arrow + realizeNoun(lexicon + pl + dim))