Пример #1
0
def generate_fst_for_factor_digit(factor, include_zero=False):
    """Build an FST rewriting a single digit to its factorized form.

    Each digit 0-9 is rewritten to itself followed by `factor` caret
    markers and a trailing space, e.g. factor=2 maps "3" -> "3^^ ".
    With factor=0 the digit maps to itself unchanged.

    Args:
        factor: power of ten this digit position represents (>= 0).
        include_zero: reserved flag for suppressing "0" positions;
            currently ignored — every digit is rewritten identically.
            TODO(review): confirm intended zero handling before use.

    Returns:
        An optimized pynini FST accepting exactly one digit.
    """
    # Hoist the invariant suffix out of the loop.
    suffix = '^' * factor + ' ' if factor > 0 else ''

    fst = pn.Fst()
    for num in range(10):
        fst = pn.union(fst, pn.t(str(num), str(num) + suffix))

    return fst.optimize()
Пример #2
0
# Finnish vowel-harmony alphabet: vowel classes decide whether the
# underspecified archiphoneme "A" surfaces as back "a" or front "ä".
back_vowel = pynini.union("u", "o", "a")
neutral_vowel = pynini.union("i", "e")
front_vowel = pynini.union("y", "ö", "ä")
vowel = pynini.union(back_vowel, neutral_vowel, front_vowel)
# Underspecified vowels, to be resolved by harmony rules.
archiphoneme = pynini.union("A", "I", "E", "O", "U")
consonant = pynini.union("b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n",
                         "p", "q", "r", "s", "t", "v", "w", "x", "z")
# Closure over the whole working alphabet; used as the rewrite context.
sigma_star = pynini.union(vowel, consonant, archiphoneme).closure().optimize()

# Adessive case ending with an underspecified harmony vowel "A".
adessive = "llA"
# Material allowed between the back-vowel trigger and the "A" target.
intervener = pynini.union(consonant, neutral_vowel).closure()
# Rule 1: "A" -> "a" after a back vowel (plus interveners);
# rule 2 (fallback): any remaining "A" -> "ä". Order matters.
adessive_harmony = (
    pynini.cdrewrite(pynini.transducer("A", "a"), back_vowel + intervener, "",
                     sigma_star) *
    pynini.cdrewrite(pynini.t("A", "ä"), "", "", sigma_star)).optimize()


def make_adessive(stem):
    """Attach the adessive ending to `stem` and resolve vowel harmony."""
    inflected = stem + adessive
    harmonized = inflected * adessive_harmony
    return harmonized.stringify()


make_adessive("training")

singular_map = pynini.union(
    pynini.transducer("feet", "foot"),
    pynini.transducer("pence", "penny"),
    # Any sequence of bytes ending in "ches" strips the "es";
    # the last argument -1 is a "weight" that gives this analysis
    # a higher priority, if it matches the input.
    sigma_star + pynini.transducer("ches", "ch", -1),
Пример #3
0
def n2w_fst():
    """Build an FST that verbalizes numbers in French.

    The input is expected in factorized form produced by the digit
    factorizer (digits annotated with "^" power-of-ten markers, e.g.
    "1^^ 9^ 1"). A fixed-order cascade of cdrewrite rules replaces the
    annotated digits with French number words, patches irregular
    combinations (vingt/quatre-vingts, mille/million agreement), and
    finally spells out a decimal part introduced by "." or ",".

    Returns:
        The composed, optimized pynini transducer.

    NOTE(review): the unaccented "zero" is emitted; the accented "zéro"
    alternatives are kept below, commented out — confirm which is wanted.
    """
    # Factorizer: annotates each digit with its power-of-ten markers.
    factor_fst = generate_fst_digit()

    # full french alphabet - https://en.wikiversity.org/wiki/French/Alphabet
    alphabet_full = pn.u(
        *".0123456789^ _-abcdefghijklmnopqrstuvwxyzàèùéâêîôûëïüÿæœç").star
    fsa_0_9 = pn.u(*"0123456789").star

    # Standalone zero (only applies when the whole number is "0").
    # single_zero = pn.t("0", "zéro")
    single_zero = pn.t("0", "zero")

    # Units digit; "0" deletes (bare zero is handled by single_zero).
    single_digits = pn.string_map({
        "0": "",  # zéro
        "1": "un",
        "2": "deux",
        "3": "trois",
        "4": "quatre",
        "5": "cinq",
        "6": "six",
        "7": "sept",
        "8": "huit",
        "9": "neuf",
    })

    # Delete zero-valued tens/hundreds positions; special-case "mille".
    zeros = pn.string_map({
        # "0^^ 0^ 0": "",
        "0^ ": "",
        "0^^ ": "",
        "mille_0^^ 0^ 0": "mille",
    })

    # Irregular teens 10-19.
    teens_10_19 = pn.string_map({
        "1^ 0": "dix",
        "1^ 1": "onze",
        "1^ 2": "douze",
        "1^ 3": "treize",
        "1^ 4": "quatorze",
        "1^ 5": "quinze",
        "1^ 6": "seize",
        "1^ 7": "dix-sept",
        "1^ 8": "dix-huit",
        "1^ 9": "dix-neuf",
    })

    # Round tens 20-60 and their "et un" forms (vingt et un, ...).
    mult_20_60 = pn.string_map({
        "2^ 0": "vingt",
        "2^ 1": "vingt_et_un",
        "3^ 0": "trente",
        "3^ 1": "trente_et_un",
        "4^ 0": "quarante",
        "4^ 1": "quarante_et_un",
        "5^ 0": "cinquante",
        "5^ 1": "cinquante_et_un",
        "6^ 0": "soixante",
        "6^ 1": "soixante_et_un",
    })

    # Tens prefix for 22-69 when followed by a non-special units digit.
    mult_2x_6x = pn.string_map({
        "2^ ": "vingt-",
        "3^ ": "trente-",
        "4^ ": "quarante-",
        "5^ ": "cinquante-",
        "6^ ": "soixante-",
    })

    # Irregular 70s and 90s (base-20 compounds), plus "quatre-vingts".
    mult_70_90 = pn.string_map({
        "7^ 0": "soixante-dix",
        "7^ 1": "soixante_et_onze",
        "7^ 2": "soixante-douze",
        "7^ 3": "soixante-treize",
        "7^ 4": "soixante-quatorze",
        "7^ 5": "soixante-quinze",
        "7^ 6": "soixante-seize",
        "7^ 7": "soixante-dix-sept",
        "7^ 8": "soixante-dix-huit",
        "7^ 9": "soixante-dix-neuf",
        "8^ 0": "quatre-vingts",
        "9^ 0": "quatre-vingt-dix",
        "9^ 1": "quatre-vingt-onze",
        "9^ 2": "quatre-vingt-douze",
        "9^ 3": "quatre-vingt-treize",
        "9^ 4": "quatre-vingt-quatorze",
        "9^ 5": "quatre-vingt-quinze",
        "9^ 6": "quatre-vingt-seize",
        "9^ 7": "quatre-vingt-dix-sept",
        "9^ 8": "quatre-vingt-dix-huit",
        "9^ 9": "quatre-vingt-dix-neuf",
    })

    # 81-89: "quatre-vingt-" + units (no trailing "s" before a digit).
    mult_8x = pn.string_map({
        "8^ ": "quatre-vingt-",
    })

    # Exact hundreds: "cent", "deux_cents", ... (plural "s" only here).
    hundreds_alone = pn.string_map({
        "1^^ 0^ 0": "cent",
        "2^^ 0^ 0": "deux_cents",
        "3^^ 0^ 0": "trois_cents",
        "4^^ 0^ 0": "quatre_cents",
        "5^^ 0^ 0": "cinq_cents",
        "6^^ 0^ 0": "six_cents",
        "7^^ 0^ 0": "sept_cents",
        "8^^ 0^ 0": "huit_cents",
        "9^^ 0^ 0": "neuf_cents",
    })

    # Hundreds prefix when followed by a non-zero remainder.
    hundreds = pn.string_map({
        "1^^ ": "cent_",
        "2^^ ": "deux_cent_",
        "3^^ ": "trois_cent_",
        "4^^ ": "quatre_cent_",
        "5^^ ": "cinq_cent_",
        "6^^ ": "six_cent_",
        "7^^ ": "sept_cent_",
        "8^^ ": "huit_cent_",
        "9^^ ": "neuf_cent_",
    })

    # Insert "_mille_" after the thousands digit; the "^^^" markers are
    # stripped back to hundreds/tens/units scale by strip_triple_factor.
    mille = pn.string_map({
        "0^^^ ": "0^^^_mille_",
        "1^^^ ": "1^^^_mille_",
        "2^^^ ": "2^^^_mille_",
        "3^^^ ": "3^^^_mille_",
        "4^^^ ": "4^^^_mille_",
        "5^^^ ": "5^^^_mille_",
        "6^^^ ": "6^^^_mille_",
        "7^^^ ": "7^^^_mille_",
        "8^^^ ": "8^^^_mille_",
        "9^^^ ": "9^^^_mille_",
    })

    # Same for millions.
    million = pn.string_map({
        "0^^^^^^ ": "0^^^^^^_millions_",
        "1^^^^^^ ": "1^^^^^^_millions_",
        "2^^^^^^ ": "2^^^^^^_millions_",
        "3^^^^^^ ": "3^^^^^^_millions_",
        "4^^^^^^ ": "4^^^^^^_millions_",
        "5^^^^^^ ": "5^^^^^^_millions_",
        "6^^^^^^ ": "6^^^^^^_millions_",
        "7^^^^^^ ": "7^^^^^^_millions_",
        "8^^^^^^ ": "8^^^^^^_millions_",
        "9^^^^^^ ": "9^^^^^^_millions_",
    })

    # Reduce each three-order group to the 0-999 marker scale after
    # the mille/million words have been inserted.
    strip_triple_factor = pn.string_map({
        "^^^^^^^^": "^^",
        "^^^^^^^": "^",
        "^^^^^^": "",
        "^^^^^": "^^",
        "^^^^": "^",
        "^^^": "",
    })

    # French drops "un" before "mille" but keeps it before "million".
    un_mille_million = pn.string_map({
        "un_mille": "mille",
        "un_millions": "un_million",
    })

    # Post-processing patch-ups for agreement and leftover separators.
    fixmeup = pn.string_map({
        # "zzzzz" : "xxxxxx",
        "_cent__millions__mille": "_cents_millions",
        "millions_un_mille": "millions_mille",
        # "million--mille": "million",
        "millions__mille": "millions",
        "vingts_mille": "vingt_mille",
        "cent__mille": "cent_mille",
    })

    # Second patch-up pass (applied after fixmeup).
    fixmeup2 = pn.string_map({
        "million__mille": "million",
        "_cent__millions": "_cents_millions",
        "million_un_mille": "million_mille",
        "__": "_",
    })

    # Digit-by-digit verbalization of the decimal part.
    decimals = pn.string_map({
        # "0": "zéro ",  # zéro
        "0": "zero ",  # zéro
        "1": "un ",
        "2": "deux ",
        "3": "trois ",
        "4": "quatre ",
        "5": "cinq ",
        "6": "six ",
        "7": "sept ",
        "8": "huit ",
        "9": "neuf ",
        "_": " ",
    })

    # Context acceptors: end/begin-of-string markers and the decimal mark.
    fsa_eos = pn.a("[EOS]")
    fsa_bos = pn.a("[BOS]")
    fsa_dot_comma = pn.u(".", ",")

    # "." / "," -> " virgule " (French decimal separator word).
    fst_dot_comma = pn.cdrewrite(
        pn.u(pn.t(".", " virgule "), pn.t(",", " virgule ")), "", "",
        alphabet_full)

    fst_decimals = pn.cdrewrite(decimals, "", "", alphabet_full)

    # Each rewrite rule below is constrained by the context where it may
    # fire; e.g. zero positions are only deleted before more digits, the
    # end of string, or the decimal mark.
    fst_zeros = pn.cdrewrite(zeros, "", fsa_0_9 | fsa_eos | fsa_dot_comma,
                             alphabet_full)

    fst_single_zero = pn.cdrewrite(single_zero, "", fsa_eos | fsa_dot_comma,
                                   alphabet_full)

    fst_single_digits = pn.cdrewrite(single_digits, "",
                                     pn.u(fsa_eos, "-", "_", fsa_dot_comma),
                                     alphabet_full)

    fst_teens = pn.cdrewrite(teens_10_19, "", "", alphabet_full)

    fst_mult_20_60 = pn.cdrewrite(mult_20_60, "", "", alphabet_full)
    fst_mult_2x_6x = pn.cdrewrite(mult_2x_6x, "", fsa_0_9, alphabet_full)

    fst_mult_70_90 = pn.cdrewrite(mult_70_90, "", "", alphabet_full)
    fst_mult_8x = pn.cdrewrite(mult_8x, "", fsa_0_9, alphabet_full)

    fst_hundreds_alone = pn.cdrewrite(hundreds_alone, "", fsa_eos,
                                      alphabet_full)
    fst_hundreds = pn.cdrewrite(hundreds, "", fsa_0_9, alphabet_full)

    fst_mille = pn.cdrewrite(mille, "", fsa_0_9, alphabet_full)
    fst_million = pn.cdrewrite(million, "", fsa_0_9, alphabet_full)

    fst_strip_triple_factor = pn.cdrewrite(strip_triple_factor, fsa_0_9,
                                           pn.u(" ", "-", "_"), alphabet_full)

    # "un mille"/"un millions" fixes only apply at the very beginning.
    fst_un_mille_million = pn.cdrewrite(un_mille_million, fsa_bos, "",
                                        alphabet_full)

    fst_fixmeup = pn.cdrewrite(fixmeup, "", "", alphabet_full)
    fst_fixmeup2 = pn.cdrewrite(fixmeup2, "", "", alphabet_full)

    # Compose the full cascade. Order is significant: larger units
    # (millions, mille) first, then hundreds, tens, teens, zeros, units,
    # then the patch-up and decimal passes.
    fst = factor_fst * fst_million * fst_mille * fst_strip_triple_factor * \
        fst_hundreds_alone * fst_hundreds * \
        fst_mult_70_90 * fst_mult_8x * fst_mult_20_60 * fst_mult_2x_6x * \
        fst_teens * fst_zeros * fst_single_zero * fst_single_digits * \
        fst_un_mille_million * fst_fixmeup * fst_fixmeup2 * \
        fst_dot_comma * fst_decimals

    transformer = fst.optimize()

    ## ---------- YOUR PART ENDS------------
    return transformer
def future_perfective(stem):
    """Transducer for first-person singular future tense of perfective verbs.

    Takes an infinitive `stem` (a string or pynini acceptor over the
    Russian alphabet) and rewrites its ending through a fixed-order
    cascade of context-dependent rules: consonant mutations, spelling
    rules after sibilants, reflexive suffixes, and a final fallback that
    strips the infinitive "ть". Returns the inflected surface string.

    NOTE(review): rules apply with empty contexts (anywhere in the
    string) and in a fixed order — later rules only see what earlier
    ones left behind.
    """

    vowels = pynini.union("а", "е", "ё", "и", "о", "у", "ы", "э", "ю", "я")
    yer = pynini.union("ь", "ъ")
    consonants = pynini.union("б", "в", "г", "д", "ж", "з", "й", "к", "л", "м",
                              "н", "п", "р", "с", "т", "ф", "х", "ц", "ч", "ш",
                              "щ")
    # Closure over the Cyrillic working alphabet; rewrite context.
    sigma_star = pynini.union(vowels, consonants, yer).closure()
    # Define rules for a 1SG future tense inflection
    future_tense_map = pynini.union(
        #Consonant mutation cases as mentioned in Wade, 2010
        # т : ч
        pynini.cdrewrite(pynini.t("тать", "чу"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("тить", "чу"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("теть", "чу"), "", "", sigma_star) *
        # д : ж
        pynini.cdrewrite(pynini.t("деться", "жусь"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("дить", "жу"), "", "", sigma_star) *
        # в : вл
        pynini.cdrewrite(pynini.t("вить", "влю"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("виться", "влюсь"), "", "", sigma_star) *
        # c : ш
        pynini.cdrewrite(pynini.t("саться", "шусь"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("сить", "шу"), "", "", sigma_star) *
        # м : мл
        pynini.cdrewrite(pynini.t("мить", "млю"), "", "", sigma_star) *
        # б : бл
        pynini.cdrewrite(pynini.t("бить", "блю"), "", "", sigma_star) *
        # п : пл
        pynini.cdrewrite(pynini.t("пать", "плю"), "", "", sigma_star) *

        #Consonant mutation cases not mentioned in Wade, 2010
        # ч : к (Wade, 2010 к : ч)
        pynini.cdrewrite(pynini.t("речь", "реку"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("чься", "кусь"), "", "", sigma_star) *
        # ч : г
        pynini.cdrewrite(pynini.t("ечь", "ягу"), "", "", sigma_star) *
        # х : д
        pynini.cdrewrite(pynini.t("хать", "ду"), "", "", sigma_star) *
        # c : д
        pynini.cdrewrite(pynini.t("сть", "ду"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("стить", "щу"), "", "", sigma_star) *

        #First singular form of future with ю
        pynini.cdrewrite(pynini.t("ить", "ю"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("тать", "таю"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("ртеть", "ртею"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("мыть", "мою"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("еть", "ею"), "", "", sigma_star) *
        # NOTE(review): duplicate of the "ртеть" rule above — harmless no-op.
        pynini.cdrewrite(pynini.t("ртеть", "ртею"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("лать", "лаю"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("питать", "питаю"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("меть", "мею"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("лоть", "лю"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("отлить", "отолью"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("ли", "лю"), "", "", sigma_star) *
        # Mutation with a soft sign
        pynini.cdrewrite(pynini.t("шить", "шью"), "", "", sigma_star) *

        #Spelling rule: у instead of ю after sibilants ж, ч, ш, щ
        pynini.cdrewrite(pynini.t("щи", "щу"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("жить", "жу"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("зить", "жу"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("чить", "чу"), "", "", sigma_star) *

        #Future with reflexive suffix
        pynini.cdrewrite(pynini.t("ся", "сь"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("иться", "юсь"), "", "", sigma_star) *
        pynini.cdrewrite(pynini.t("аться", "усь"), "", "", sigma_star) *

        #Stem change (vowel deletion)
        pynini.cdrewrite(pynini.t("тереть", "тру"), "", "", sigma_star) *

        #Verbs ending in -дать form future with -м
        pynini.cdrewrite(pynini.t("дать", "дам"), "", "", sigma_star) *

        #Deletion of two last letters in the infinitive stem
        pynini.cdrewrite(pynini.t("ть", ""), "", "", sigma_star), ).optimize()

    return (stem * future_tense_map).stringify()
# Working alphabet for the Coptic-to-IPA rules below.
# NOTE(review): coptic_sigma, latin_sigma, punct_whitespace_sigma, vowels,
# ipa_sigma and wb are defined elsewhere in this file — confirm their shapes.
sigma = pynini.union(coptic_sigma, latin_sigma, punct_whitespace_sigma, vowels, ipa_sigma, wb)

sigma_star = pynini.closure(sigma)


#rules

# Inserts a word-boundary marker.
insert_wb = pynini.transducer("", "[WB]")
# pynini.t("", "[WB]") + sigma_star + pynini.t("", "[WB]")
# Add WB when coptic letters are on the left and whitespace or punctuation are on the right
rule_addwb_1 = pynini.cdrewrite(insert_wb, coptic_sigma, punct_whitespace_sigma, sigma_star)
# Add WB when whitespace or punctuation are on the left and coptic letters are on the right
rule_addwb_2 = pynini.cdrewrite(insert_wb, punct_whitespace_sigma, coptic_sigma, sigma_star)

# Strip the word-boundary markers once context-sensitive rules have run.
rule_removewb = pynini.cdrewrite(pynini.t("[WB]", ""), "", "", sigma_star)

#alpha

# Alpha -> æ when followed by word-final "ⲥ".
alphatoa_1 = pynini.transducer("ⲁ", "æ")
rule_1 = pynini.cdrewrite(alphatoa_1, "", "ⲥ[WB]", sigma_star)

# Alpha -> ə before a word boundary.
alphatoa_2 = pynini.transducer("ⲁ", "ə")
rule_2 = pynini.cdrewrite(alphatoa_2, "", wb, sigma_star)

# Alpha -> ɛ; the corresponding rule is not defined in this chunk.
alphatoa_3 = pynini.transducer("ⲁ", "ɛ")
###rule_3###

# Default: alpha -> ɑː in all remaining contexts.
alphatoa_4 = pynini.transducer("ⲁ", "ɑː")
rule_4 = pynini.cdrewrite(alphatoa_4, "", "", sigma_star)
Пример #6
0
# map_10_to_19
# map_20_to_90

# Now, define a FST that uses the mapper FSTs to transform factorized form to
# verbalized form:
# 0    -> zero
# 1^   -> ten
# 1^ 1 -> eleven
# 9^ 1 -> ninety one
# 1^^ 9^ 1 -> ['one hundred ninety one', 'hundred ninety one']
# TODO: currently only works for single digits (and doesn't work for zero)

# Acceptors for non-zero digits and all digits.
a1_9 = pn.u(*"123456789").optimize()
a0_9 = (a1_9 | pn.a("0")).optimize()

# Factorizer for 1-2 digit numbers: optional tens digit marked "^ ", then units.
f1 = (((a1_9 + pn.t("", "^ ")) | "") + a0_9).optimize()
# Factorizer for 3-digit numbers: hundreds "^^ ", tens "^ ", units.
f2 = ((a1_9 + pn.t("", "^^ ")) + ((a0_9 + pn.t("", "^ "))) + a0_9).optimize()
f = (f2 | f1).optimize()
# Also accept an optional decimal part after ".".
f = pn.u(f, f + "." + a0_9.plus)

# English words for the non-zero units digits.
map1_9 = {
    "1": "one",
    "2": "two",
    "3": "three",
    "4": "four",
    "5": "five",
    "6": "six",
    "7": "seven",
    "8": "eight",
    "9": "nine"
}
Пример #7
0
import pynini as pn
import random

# compose - *
# concat  - +
# union   - |

# Toy FST: ("a"|"e") then "a" -> zero to five "0"s, unioned with
# (any run of "a"s -> "0") followed by literal "xxx".
fst = (pn.a("a") | pn.a("e")) + pn.t("a",
                                     pn.a("0").closure(0, 5)) | pn.t(
                                         pn.a("a").star, "0") + pn.a("xxx")
fst = fst.optimize()

# Sample random paths through the FST and collect the distinct outputs.
output_strings = set()

for i in range(10000):
    # randgen draws one path per call; the third argument seeds the draw.
    s = pn.randgen(fst, 1, random.randint(0, 100000)).stringify()
    output_strings.add(s)

print(len(output_strings))

for output_string in output_strings:
    print(output_string)


def top_paths(fst, count=100):
    """Return the distinct output strings of up to `count` shortest paths,
    sorted lexicographically."""
    shortest = pn.shortestpath(fst, nshortest=count)
    outputs = {path[1] for path in shortest.paths()}
    return sorted(outputs)


# Header for the input listing that follows (continued past this chunk).
print("INPUTS")
print("\t")
Пример #8
0
# Acceptor for the non-zero digits 1..9.
a_1_to_9 = pn.u(*"123456789").optimize()

# Create an acceptor for digits 0..9
a_0_to_9 = (a_1_to_9 | pn.a("0")).optimize()

# First, let's define the factorizer.
# Factorizer converts numbers to their factorized form, using ^ characters
# to denote powers of ten:
#
# 0    -> 0
# 1    -> 1
# 10   -> 1^
# 23   -> 2^ 3
# 203 ->  2^^ 3
# TODO: currently only works for 0..99
# Optional tens digit (1-9) marked with "^ ", followed by a units digit.
factorizer = (((a_1_to_9 + pn.t("", "^ ")) | "") + a_0_to_9).optimize()

# You can debug the factorizer by generating random paths through it
# print(list(pn.randgen(factorizer, 5).paths()))

# Now, let's define number-to-string mappings

map_1_to_9 = {
    "1": "one",
    "2": "two",
    "3": "three",
    "4": "four",
    "5": "five",
    "6": "six",
    "7": "seven",
    "8": "eight",