예제 #1
0
def generate_fst_digit():
    """Build the FST that accepts numbers of one to nine places.

    For every length from 1 to 9 the per-place FSTs produced by
    ``generate_fst_for_factor_digit`` are concatenated from the highest place
    down to the lowest, and the results for all lengths are unioned together.
    A one-place number uses the dedicated FST built with
    ``generate_fst_for_factor_digit(0, True)`` instead of the regular
    factor-0 FST.  The integer part may be followed by ``.`` or ``,`` and any
    run of digits.

    Returns:
        The optimized FST.
    """
    # Build the helper FSTs in the same order as before: the special
    # single-digit one first, then one per factor 0..9.
    lone_digit = generate_fst_for_factor_digit(0, True)
    by_factor = {}
    for factor in range(10):
        by_factor[factor] = generate_fst_for_factor_digit(factor)

    # The union is seeded with the empty-string acceptor.
    all_lengths = pn.a("")
    for width in range(1, 10):
        if width == 1:
            places = [lone_digit]
        else:
            # Highest place first: factor width-1 down to factor 0.
            places = [by_factor[place - 1] for place in range(width, 0, -1)]

        number = pn.a("")
        for place_fst in places:
            number = pn.concat(number, place_fst)

        all_lengths = pn.union(all_lengths, number)

    # Optional tail: a "." or "," followed by zero or more digits.
    fraction = pn.u(".", ",") + pn.u(*"0123456789").star
    with_fraction = all_lengths + pn.u("", fraction)
    return with_fraction.optimize()
예제 #2
0
def n2w_fst():
    """Assemble the number-to-French-words transducer.

    The FST returned by ``generate_fst_digit`` is composed with a fixed
    sequence of context-dependent rewrite rules: millions, thousands, caret
    stripping, hundreds, tens, teens, zeros, units, clean-up of the generated
    words, the decimal separator and finally the remaining digits.  The order
    of the stages matters because several later rules match text that earlier
    rules emit (for example ``un_mille`` or ``millions__mille``).

    Returns:
        The optimized composition of all stages.
    """
    factor_fst = generate_fst_digit()

    # Closure over the symbols every rewrite rule is allowed to see.
    # full french alphabet - https://en.wikiversity.org/wiki/French/Alphabet
    # NOTE(review): "," is not part of this alphabet although the separator
    # rules below mention it -- confirm whether that is intended.
    sigma_star = pn.u(
        *".0123456789^ _-abcdefghijklmnopqrstuvwxyzàèùéâêîôûëïüÿæœç").star

    # NOTE(review): this is a closure (`.star`), so when used as a context it
    # presumably also matches the empty string -- confirm that is intended.
    digit_run = pn.u(*"0123456789").star
    end_of_string = pn.a("[EOS]")
    start_of_string = pn.a("[BOS]")
    separator = pn.u(".", ",")

    def rewrite(rule, left="", right=""):
        """Compile `rule` as a rewrite over sigma_star with the given contexts."""
        return pn.cdrewrite(rule, left, right, sigma_star)

    # ------------------------------------------------------------------
    # Word lists from which the regular tables are derived.
    # ------------------------------------------------------------------
    all_digits = "0123456789"
    unit_words = [
        ("1", "un"),
        ("2", "deux"),
        ("3", "trois"),
        ("4", "quatre"),
        ("5", "cinq"),
        ("6", "six"),
        ("7", "sept"),
        ("8", "huit"),
        ("9", "neuf"),
    ]
    tens_words = [
        ("2", "vingt"),
        ("3", "trente"),
        ("4", "quarante"),
        ("5", "cinquante"),
        ("6", "soixante"),
    ]

    # ------------------------------------------------------------------
    # Rewrite tables.
    # ------------------------------------------------------------------
    # Million / thousand markers: the digit and its carets are kept and the
    # scale word is appended; the carets are reduced by a later stage.
    million_table = {
        d + "^^^^^^ ": d + "^^^^^^_millions_" for d in all_digits
    }
    mille_table = {d + "^^^ ": d + "^^^_mille_" for d in all_digits}

    # Reduce the caret count modulo three after a digit.
    strip_triple_factor_table = {
        "^^^^^^^^": "^^",
        "^^^^^^^": "^",
        "^^^^^^": "",
        "^^^^^": "^^",
        "^^^^": "^",
        "^^^": "",
    }

    # Exact hundreds ("cent", "deux_cents", ...) versus hundreds followed by
    # more digits ("cent_", "deux_cent_", ...).
    hundreds_alone_table = {"1^^ 0^ 0": "cent"}
    hundreds_table = {"1^^ ": "cent_"}
    for digit, word in unit_words[1:]:
        hundreds_alone_table[digit + "^^ 0^ 0"] = word + "_cents"
        hundreds_table[digit + "^^ "] = word + "_cent_"

    # 70..79, 80 and 90..99 are spelled out in full.
    mult_70_90_table = {
        "7^ 0": "soixante-dix",
        "7^ 1": "soixante_et_onze",
        "7^ 2": "soixante-douze",
        "7^ 3": "soixante-treize",
        "7^ 4": "soixante-quatorze",
        "7^ 5": "soixante-quinze",
        "7^ 6": "soixante-seize",
        "7^ 7": "soixante-dix-sept",
        "7^ 8": "soixante-dix-huit",
        "7^ 9": "soixante-dix-neuf",
        "8^ 0": "quatre-vingts",
        "9^ 0": "quatre-vingt-dix",
        "9^ 1": "quatre-vingt-onze",
        "9^ 2": "quatre-vingt-douze",
        "9^ 3": "quatre-vingt-treize",
        "9^ 4": "quatre-vingt-quatorze",
        "9^ 5": "quatre-vingt-quinze",
        "9^ 6": "quatre-vingt-seize",
        "9^ 7": "quatre-vingt-dix-sept",
        "9^ 8": "quatre-vingt-dix-huit",
        "9^ 9": "quatre-vingt-dix-neuf",
    }
    mult_8x_table = {
        "8^ ": "quatre-vingt-",
    }

    # 20..60: the round ten, the "..._et_un" form, and the generic prefix.
    mult_20_60_table = {}
    mult_2x_6x_table = {}
    for digit, word in tens_words:
        mult_20_60_table[digit + "^ 0"] = word
        mult_20_60_table[digit + "^ 1"] = word + "_et_un"
        mult_2x_6x_table[digit + "^ "] = word + "-"

    teens_table = {
        "1^ 0": "dix",
        "1^ 1": "onze",
        "1^ 2": "douze",
        "1^ 3": "treize",
        "1^ 4": "quatorze",
        "1^ 5": "quinze",
        "1^ 6": "seize",
        "1^ 7": "dix-sept",
        "1^ 8": "dix-huit",
        "1^ 9": "dix-neuf",
    }

    # Zero-valued places are dropped (the all-zero "0^^ 0^ 0" entry stays
    # disabled, as before).
    zeros_table = {
        "0^ ": "",
        "0^^ ": "",
        "mille_0^^ 0^ 0": "mille",
    }

    # Plain "zero" is emitted; the accented spelling is not used.
    single_zero_rule = pn.t("0", "zero")

    # Units: "0" maps to nothing here.
    single_digits_table = {"0": ""}
    for digit, word in unit_words:
        single_digits_table[digit] = word

    un_mille_million_table = {
        "un_mille": "mille",
        "un_millions": "un_million",
    }

    # Two passes of clean-up over the generated words.
    fixmeup_table = {
        "_cent__millions__mille": "_cents_millions",
        "millions_un_mille": "millions_mille",
        "millions__mille": "millions",
        "vingts_mille": "vingt_mille",
        "cent__mille": "cent_mille",
    }
    fixmeup2_table = {
        "million__mille": "million",
        "_cent__millions": "_cents_millions",
        "million_un_mille": "million_mille",
        "__": "_",
    }

    dot_comma_rule = pn.u(pn.t(".", " virgule "), pn.t(",", " virgule "))

    # Remaining digits are read one by one, and "_" becomes a space.
    decimals_table = {"0": "zero "}
    for digit, word in unit_words:
        decimals_table[digit] = word + " "
    decimals_table["_"] = " "

    # ------------------------------------------------------------------
    # Cascade: the stages are composed strictly in this order.
    # ------------------------------------------------------------------
    stages = [
        rewrite(pn.string_map(million_table), right=digit_run),
        rewrite(pn.string_map(mille_table), right=digit_run),
        rewrite(pn.string_map(strip_triple_factor_table),
                left=digit_run, right=pn.u(" ", "-", "_")),
        rewrite(pn.string_map(hundreds_alone_table), right=end_of_string),
        rewrite(pn.string_map(hundreds_table), right=digit_run),
        rewrite(pn.string_map(mult_70_90_table)),
        rewrite(pn.string_map(mult_8x_table), right=digit_run),
        rewrite(pn.string_map(mult_20_60_table)),
        rewrite(pn.string_map(mult_2x_6x_table), right=digit_run),
        rewrite(pn.string_map(teens_table)),
        rewrite(pn.string_map(zeros_table),
                right=digit_run | end_of_string | separator),
        rewrite(single_zero_rule, right=end_of_string | separator),
        rewrite(pn.string_map(single_digits_table),
                right=pn.u(end_of_string, "-", "_", separator)),
        rewrite(pn.string_map(un_mille_million_table), left=start_of_string),
        rewrite(pn.string_map(fixmeup_table)),
        rewrite(pn.string_map(fixmeup2_table)),
        rewrite(dot_comma_rule),
        rewrite(pn.string_map(decimals_table)),
    ]

    cascade = factor_fst
    for stage in stages:
        cascade = cascade * stage

    return cascade.optimize()
예제 #3
0
import pynini as pn
import random

# compose - *
# concat  - +
# union   - |

# First alternative: "a" or "e", then an "a" that is transduced to between
# zero and five "0" symbols.
vowel_then_zeros = (pn.a("a") | pn.a("e")) + pn.t("a",
                                                  pn.a("0").closure(0, 5))
# Second alternative: any run of "a" transduced to "0", followed by "xxx".
as_to_zero_then_xxx = pn.t(pn.a("a").star, "0") + pn.a("xxx")

fst = (vowel_then_zeros | as_to_zero_then_xxx).optimize()

# Sample one random path 10000 times, each time with a fresh random seed,
# and keep the distinct stringified results.
output_strings = set()
for _ in range(10000):
    seed = random.randint(0, 100000)
    output_strings.add(pn.randgen(fst, 1, seed).stringify())

# Report how many distinct strings were seen, then list them.
print(len(output_strings))
for output_string in output_strings:
    print(output_string)


def top_paths(fst, count=100):
    """Return the distinct second fields of the shortest paths, sorted.

    Args:
        fst: FST handed to ``pn.shortestpath``.
        count: Value passed as ``nshortest`` (default 100).

    Returns:
        A sorted list with duplicates removed.  Each element is item ``[1]``
        of a path (presumably the output string -- confirm against the pynini
        version in use).
    """
    shortest = pn.shortestpath(fst, nshortest=count)
    distinct = {path[1] for path in shortest.paths()}
    return sorted(distinct)


# Emit the "INPUTS" heading followed by a line holding a single tab.
print("INPUTS\n\t")
예제 #4
0
# TODO: define similar mappings for the teens (10..19) and the tens (20, 30, 40, etc.)
# map_10_to_19
# map_20_to_90

# Now, define a FST that uses the mapper FSTs to transform factorized form to
# verbalized form:
# 0    -> zero
# 1^   -> ten
# 1^ 1 -> eleven
# 9^ 1 -> ninety one
# 1^^ 9^ 1 -> ['one hundred ninety one', 'hundred ninety one']
# TODO: currently only works for single digits (and doesn't work for zero)

# Acceptors for a single digit 1..9 and for a single digit 0..9.
a1_9 = pn.u(*"123456789").optimize()
a0_9 = (a1_9 | pn.a("0")).optimize()

# One or two digits: an optional leading 1..9 that gets "^ " inserted after
# it, followed by a mandatory final digit.
_tens_place = a1_9 + pn.t("", "^ ")
f1 = ((_tens_place | "") + a0_9).optimize()

# Exactly three digits: "^^ " is inserted after the first digit and "^ "
# after the second.
_hundreds_place = a1_9 + pn.t("", "^^ ")
_middle_place = a0_9 + pn.t("", "^ ")
f2 = (_hundreds_place + _middle_place + a0_9).optimize()

# Either shape, optionally followed by "." and one or more digits.
_integer_part = (f2 | f1).optimize()
f = pn.u(_integer_part, _integer_part + "." + a0_9.plus)

map1_9 = {
    "1": "one",
    "2": "two",
    "3": "three",
    "4": "four",
    "5": "five",
    "6": "six",
    "7": "seven",
    "8": "eight",
예제 #5
0
import pynini as pn
import sys


# Helper function to return all outputs of the given fst in sorted order
def sorted_outputs(fst):
    """Collect item ``[1]`` of every path of ``fst`` and return them sorted.

    Args:
        fst: Any object whose ``paths()`` yields indexable path records.

    Returns:
        A new list with the second field of each path, in ascending order.
        Duplicates are kept.
    """
    outputs = [path[1] for path in fst.paths()]
    outputs.sort()
    return outputs


# Create an acceptor for digits 1..9
a_1_to_9 = pn.u(*"123456789").optimize()

# Acceptor for a single digit 0..9: the 1..9 acceptor plus "0".
a_0_to_9 = (a_1_to_9 | pn.a("0")).optimize()

# The factorizer rewrites a number into its factorized form, where "^"
# characters denote powers of ten. Intended mapping:
#
# 0    -> 0
# 1    -> 1
# 10   -> 1^
# 23   -> 2^ 3
# 203 ->  2^^ 3
#
# As written it accepts an optional leading digit 1..9 (after which "^ " is
# inserted) followed by exactly one digit 0..9.
# TODO: currently only works for 0..99
_leading_tens = a_1_to_9 + pn.t("", "^ ")
_optional_tens = _leading_tens | ""
factorizer = (_optional_tens + a_0_to_9).optimize()

# You can debug the factorizer by generating random paths through it
# print(list(pn.randgen(factorizer, 5).paths()))

# Now, let's define number-to-string mappings