Exemplo n.º 1
0
    def add_optional_and(self, graph):
        """
        Returns ``graph`` unioned with a variant whose output has "and"
        inserted after one "hundred " or one "thousand ".

        Args:
            graph: FST whose output side is post-processed
        """
        if not self.deterministic:
            # optionally replace one "hundred " in the output with a space
            drop_hundred = pynini.closure(pynini.cross("hundred ", " "), 0, 1)
            graph = pynini.compose(graph,
                                   NEMO_SIGMA + drop_hundred + NEMO_SIGMA)

        any_not_quote = pynini.closure(NEMO_NOT_QUOTE)

        # "hundred " -> "hundred and "; the quote-free remainder must not
        # contain "thousand" or "million"
        scale_words = pynini.union("thousand", "million")
        tail_without_scale = pynini.difference(
            any_not_quote,
            any_not_quote + scale_words + any_not_quote).optimize()
        and_after_hundred = pynutil.add_weight(
            pynini.cross("hundred ", "hundred and ") + tail_without_scale,
            -0.0001)
        insert_and = (any_not_quote + and_after_hundred).optimize()

        # "thousand " -> "thousand and "; the remainder is anything except a
        # quote-free string that contains "hundred"
        tail_without_hundred = pynini.difference(
            NEMO_SIGMA,
            any_not_quote + pynini.accep("hundred") + any_not_quote).optimize()
        and_after_thousand = pynutil.add_weight(
            pynini.cross("thousand ", "thousand and ") + tail_without_hundred,
            -0.0001)
        insert_and |= (any_not_quote + and_after_thousand).optimize()

        # keep the original graph as an alternative with a small extra weight
        rewritten = pynini.compose(graph, insert_and).optimize()
        return rewritten | pynutil.add_weight(graph, 0.00001)
Exemplo n.º 2
0
    def __init__(self, cardinal: GraphFst, deterministic: bool):
        """
        Builds the "decimal" classify FST and stores it in ``self.fst``.

        The tagged output has the form
        ``[negative: "true" ][integer_part: "..." ]fractional_part: "..."``,
        optionally extended by ``get_quantity`` (defined elsewhere).

        Args:
            cardinal: cardinal GraphFst; its ``graph``, ``single_digits_graph``
                and ``graph_hundred_component_at_least_one_none_zero_digit``
                attributes are used
            deterministic: forwarded to the base class; when False, extra
                alternatives are added (see the inline comments)
        """
        super().__init__(name="decimal",
                         kind="classify",
                         deterministic=deterministic)

        cardinal_graph = cardinal.graph
        cardinal_graph_hundred_component_at_least_one_none_zero_digit = (
            cardinal.graph_hundred_component_at_least_one_none_zero_digit)

        # graph used for the fractional part
        self.graph = cardinal.single_digits_graph.optimize()

        # non-deterministic mode also lets the fractional part go through the
        # full cardinal graph
        if not deterministic:
            self.graph = self.graph | cardinal_graph

        # the decimal point itself is removed from the output
        point = pynutil.delete(".")
        # optional leading "-" becomes `negative: "true" `
        optional_graph_negative = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0,
            1)

        self.graph_fractional = pynutil.insert(
            "fractional_part: \"") + self.graph + pynutil.insert("\"")
        self.graph_integer = pynutil.insert(
            "integer_part: \"") + cardinal_graph + pynutil.insert("\"")
        # the integer part is optional, the point and fractional part are not
        final_graph_wo_sign = (
            pynini.closure(self.graph_integer + pynutil.insert(" "), 0, 1) +
            point + pynutil.insert(" ") + self.graph_fractional)

        self.final_graph_wo_negative = final_graph_wo_sign | get_quantity(
            final_graph_wo_sign,
            cardinal_graph_hundred_component_at_least_one_none_zero_digit)

        # reduce options for non_deterministic and allow either "oh" or "zero", but not combination
        if not deterministic:
            # everything except strings containing both "oh" and "zero",
            # in either order
            no_oh_zero = pynini.difference(
                NEMO_SIGMA,
                (NEMO_SIGMA + "oh" + NEMO_SIGMA + "zero" + NEMO_SIGMA)
                | (NEMO_SIGMA + "zero" + NEMO_SIGMA + "oh" + NEMO_SIGMA),
            ).optimize()
            # NOTE(review): this excludes "zero ... oh" only, which the second
            # alternative of no_oh_zero above already excludes; the filter
            # looks redundant.
            no_zero_oh = pynini.difference(
                NEMO_SIGMA, NEMO_SIGMA + pynini.accep("zero") + NEMO_SIGMA +
                pynini.accep("oh") + NEMO_SIGMA).optimize()

            # add a variant in which `integer_part: "zero"` is rewritten to
            # `integer_part: "oh"`
            self.final_graph_wo_negative |= pynini.compose(
                self.final_graph_wo_negative,
                pynini.cdrewrite(
                    pynini.cross("integer_part: \"zero\"",
                                 "integer_part: \"oh\""), NEMO_SIGMA,
                    NEMO_SIGMA, NEMO_SIGMA),
            )
            # drop outputs that mix "oh" and "zero"
            self.final_graph_wo_negative = pynini.compose(
                self.final_graph_wo_negative, no_oh_zero).optimize()
            self.final_graph_wo_negative = pynini.compose(
                self.final_graph_wo_negative, no_zero_oh).optimize()

        final_graph = optional_graph_negative + self.final_graph_wo_negative

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Exemplo n.º 3
0
def cody_rand_gen_no_duplicate(acceptor, n):
    """
    Samples strings from ``acceptor`` over several rounds until at least ``n``
    distinct strings have been collected (or the round limit is hit).

    Each round draws a slightly larger uniform sample with a new seed and adds
    its strings to the running set.

    Args:
        acceptor: FSA to sample from
        n: number of distinct strings wanted

    Returns:
        (acceptor with the last round's sample removed,
         sorted list of at most ``n`` distinct strings)
    """
    max_rounds = 50000
    collected = set()
    seed = 0
    for attempt in range(max_rounds):
        print('started loop ' + str(attempt))
        sample = pynini.randgen(acceptor,
                                npath=int(n + n * attempt * 0.1),
                                seed=seed,
                                select='uniform',
                                max_length=2147483647,
                                weighted=False)
        collected = collected.union(set(list_string_set(sample)))

        out_of_rounds = attempt >= max_rounds - 1
        if len(collected) < n and not out_of_rounds:
            print('insufficient random strings')
            seed += 1
            continue

        # enough strings, or last round: pick a random subset and return it
        picked = list(collected)
        random.shuffle(picked)
        picked = sorted(picked[:n])
        # only the sample of this final round is subtracted
        acceptor = pynini.difference(acceptor, sample)
        print('returning')
        if len(picked) >= n:
            print('got full rand_list\n')
        return acceptor, picked
Exemplo n.º 4
0
def get_quantity(decimal: 'pynini.FstLike',
                 cardinal_up_to_hundred: 'pynini.FstLike') -> 'pynini.FstLike':
    """
    Returns FST that tags a cardinal or a decimal followed by a quantity word,
    e.g. một triệu -> integer_part: "1" quantity: "triệu"
    e.g. một tỷ rưỡi -> integer_part: "1" fractional_part: "5" quantity: "tỷ"

    Args:
        decimal: decimal FST
        cardinal_up_to_hundred: cardinal FST
    """
    # delete leading zeros; a non-zero digit must come first after them
    significant_digits = (pynutil.delete(pynini.closure("0")) +
                          pynini.difference(NEMO_DIGIT, "0") +
                          pynini.closure(NEMO_DIGIT))
    numbers = cardinal_up_to_hundred @ significant_digits

    suffix = pynini.union("triệu", "tỉ", "tỷ", "vạn")

    # optional single trailing digit word after the quantity word
    trailing_digit = (graph_digit
                      | pynini.cross("rưỡi", "5")
                      | pynini.cross("mốt", "1")
                      | pynini.cross("tư", "4"))
    optional_fraction_graph = pynini.closure(
        delete_extra_space + pynutil.insert("fractional_part: \"") +
        trailing_digit + pynutil.insert("\""),
        0,
        1,
    )

    open_quantity = pynutil.insert("quantity: \"")
    close_quote = pynutil.insert("\"")

    cardinal_branch = (pynutil.insert("integer_part: \"") + numbers +
                       close_quote + delete_extra_space + open_quantity +
                       suffix + close_quote + optional_fraction_graph)
    # decimals additionally accept the "thousand" words
    decimal_branch = (decimal + delete_extra_space + open_quantity +
                      (suffix | "ngàn" | "nghìn") + close_quote)
    return cardinal_branch | decimal_branch
Exemplo n.º 5
0
    def __init__(self, input_case: str, deterministic: bool = True):
        """
        Builds the "whitelist" classify FST from the whitelist and measurement
        tables plus the TO_LATIN mapping.

        Args:
            input_case: accepted for interface compatibility; the written form
                is lower-cased whatever its value
            deterministic: forwarded to the base class
        """
        super().__init__(name="whitelist",
                         kind="classify",
                         deterministic=deterministic)

        def _load_lowercased(file="data/whitelist.tsv"):
            # first column is lower-cased, remaining columns are kept as is
            rows = load_labels(get_abs_path(file))
            return pynini.string_map([[row[0].lower()] + row[1:]
                                      for row in rows])

        graph = _load_lowercased()

        units_graph = _load_lowercased(file="data/measurements.tsv")
        # do not replace single letter units, like `м` or `°`
        multi_char_units = pynini.difference(
            pynini.project(units_graph, "input"), NEMO_ALPHA)
        units_graph = pynini.compose(multi_char_units, units_graph)

        graph |= units_graph.optimize()
        # one or more TO_LATIN symbols, separated by inserted spaces
        graph |= TO_LATIN + pynini.closure(pynutil.insert(" ") + TO_LATIN)

        self.final_graph = convert_space(graph)
        tagged = (pynutil.insert("name: \"") + self.final_graph +
                  pynutil.insert("\""))
        self.fst = tagged.optimize()
Exemplo n.º 6
0
def get_quantity(decimal: 'pynini.FstLike',
                 cardinal_up_to_hundred: 'pynini.FstLike') -> 'pynini.FstLike':
    """
    Returns FST that tags a cardinal or a decimal followed by a quantity word,
    e.g. eine million -> integer_part: "1" quantity: "million"
    e.g. eins komma vier millionen -> integer_part: "1" fractional_part: "4" quantity: "millionen"

    Args:
        decimal: decimal FST
        cardinal_up_to_hundred: cardinal FST
    """
    # delete leading zeros; a non-zero digit must come first after them
    significant_digits = (pynutil.delete(pynini.closure("0")) +
                          pynini.difference(NEMO_DIGIT, "0") +
                          pynini.closure(NEMO_DIGIT))
    numbers = cardinal_up_to_hundred @ significant_digits

    quantity_words = (
        "million",
        "millionen",
        "milliarde",
        "milliarden",
        "billion",
        "billionen",
        "billiarde",
        "billiarden",
        "trillion",
        "trillionen",
        "trilliarde",
        "trilliarden",
    )
    suffix = pynini.union(*quantity_words)

    # space, then `quantity: "<word>"`
    tagged_quantity = (delete_extra_space + pynutil.insert("quantity: \"") +
                       suffix + pynutil.insert("\""))

    cardinal_branch = (pynutil.insert("integer_part: \"") + numbers +
                       pynutil.insert("\"") + tagged_quantity)
    decimal_branch = decimal + tagged_quantity
    return cardinal_branch | decimal_branch
Exemplo n.º 7
0
def create_data_with_duplicate(filename, pos_dict, neg_dict, min_len, max_len, num, get_difference):
    """
    Writes sampled positive and negative strings to ``filename`` as
    tab-separated ``<string>\\tTRUE`` / ``<string>\\tFALSE`` lines.

    For every length key from ``min_len`` to ``max_len`` (inclusive), ``num``
    paths are sampled from the positive and then from the negative automaton.
    When ``get_difference == 1`` the sample is subtracted from the automaton
    stored in the dictionary.

    Returns:
        (pos_dict, neg_dict), possibly updated in place
    """
    labelled_sources = ((pos_dict, "TRUE"), (neg_dict, "FALSE"))
    with open(filename, "w+") as out:
        for length in range(min_len, max_len + 1):
            for fsa_by_length, label in labelled_sources:
                sample = pynini.randgen(fsa_by_length[length], npath=num, seed=0, select="uniform",
                                        max_length=2147483647, weighted=False)
                if get_difference == 1:
                    fsa_by_length[length] = pynini.difference(fsa_by_length[length], sample)
                for text in list_string_set(sample):
                    out.write(text + "\t" + label + "\n")
    return pos_dict, neg_dict
Exemplo n.º 8
0
Arquivo: word.py Projeto: NVIDIA/NeMo
    def __init__(self, deterministic: bool = True):
        """
        Builds the "word" classify FST: either a bracketed phoneme sequence or
        a run of non-space characters free of digits and selected symbols.

        Args:
            deterministic: when False, one optional space is also accepted
                right after "[" and right before "]"
        """
        super().__init__(name="word", kind="classify", deterministic=deterministic)

        excluded_symbols = pynini.union("$", "€", "₩", "£", "¥", "#", "%")
        symbols_to_exclude = (excluded_symbols | NEMO_DIGIT).optimize()
        allowed_char = pynini.difference(NEMO_NOT_SPACE, symbols_to_exclude)
        graph = pynini.closure(allowed_char, 1)

        # leave phones of format [HH AH0 L OW1] untouched
        phoneme_unit = pynini.closure(NEMO_ALPHA, 1) + pynini.closure(NEMO_DIGIT)
        open_bracket = pynini.accep(pynini.escape("["))
        close_bracket = pynini.accep(pynini.escape("]"))
        units = pynini.closure(phoneme_unit + pynini.accep(" ")) + phoneme_unit

        if deterministic:
            phoneme = open_bracket + units + close_bracket
        else:
            optional_space = pynini.closure(pynini.accep(" "), 0, 1)
            phoneme = open_bracket + optional_space + units + optional_space + close_bracket

        self.graph = plurals._priority_union(convert_space(phoneme), graph, NEMO_SIGMA)
        tagged = pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")
        self.fst = tagged.optimize()
Exemplo n.º 9
0
    def __init__(self, number_names: dict, alternative_formats: dict, deterministic=False):
        """
        Builds the "ordinal" classify FST.

        Args:
            number_names: dict providing 'ordinal_number_names'
            alternative_formats: dict providing 'one_thousand_alternative'
                and 'separators'
            deterministic: forwarded to the base class
        """
        super().__init__(name="ordinal", kind="classify", deterministic=deterministic)

        thousand_alternative = alternative_formats['one_thousand_alternative']
        separators = alternative_formats['separators']

        ordinal = number_names['ordinal_number_names']
        ordinal |= ordinal @ thousand_alternative
        ordinal_numbers = separators @ ordinal

        # to handle cases like 2-ая
        endings = pynini.string_file(get_abs_path("data/numbers/ordinal_endings.tsv"))
        not_dash = pynini.closure(pynini.difference(NEMO_SIGMA, "-"))
        # removes "-<anything without a dash>" at the end of the string
        del_ending = pynini.cdrewrite(pynini.cross("-" + not_dash, ""), "", "[EOS]", NEMO_SIGMA)

        number_with_suffix = ((separators @ ordinal).optimize() + pynini.accep("-") + not_dash).optimize()
        ends_with_known_ending = (NEMO_SIGMA + endings).optimize()
        ordinal_numbers_marked = (number_with_suffix @ ends_with_known_ending @ del_ending).optimize()

        self.ordinal_numbers = ordinal_numbers
        # "03" -> remove leading zeros and verbalize
        strip_zeros = pynini.closure(pynini.cross("0", ""))
        self.ordinal_numbers_with_leading_zeros = (strip_zeros + ordinal_numbers).optimize()

        either_form = (ordinal_numbers | ordinal_numbers_marked).optimize()
        tagged = pynutil.insert("integer: \"") + either_form + pynutil.insert("\"")
        self.fst = self.add_tokens(tagged).optimize()
Exemplo n.º 10
0
    def __init__(self,
                 whitelist: 'pynini.FstLike',
                 deterministic: bool = True):
        """
        Builds the "abbreviation" classify FST for sequences of upper-case
        letters, with or without dots, excluding whitelist inputs.

        Args:
            whitelist: object whose ``graph`` attribute holds the whitelist FST
            deterministic: forwarded to the base class
        """
        super().__init__(name="abbreviation",
                         kind="classify",
                         deterministic=deterministic)

        period = pynini.accep(".")
        dotted_letter = NEMO_UPPER + period

        # A.B.C. -> A. B. C.
        dotted_spaced = dotted_letter + pynini.closure(
            insert_space + dotted_letter, 1)
        # A.B.C. -> A.B.C.
        dotted_kept = dotted_letter + pynini.closure(dotted_letter, 1)
        # ABC -> ABC
        plain_kept = NEMO_UPPER + pynini.closure(NEMO_UPPER, 1)
        # ABC -> A B C
        plain_spaced = NEMO_UPPER + pynini.closure(insert_space + NEMO_UPPER,
                                                   1)
        graph = dotted_spaced | dotted_kept | plain_kept | plain_spaced

        # exclude words that are included in the whitelist
        allowed_inputs = pynini.difference(
            pynini.project(graph, "input"),
            pynini.project(whitelist.graph, "input"))
        graph = pynini.compose(allowed_inputs, graph)

        tagged = pynutil.insert(
            "value: \"") + graph.optimize() + pynutil.insert("\"")
        self.fst = self.add_tokens(tagged).optimize()
Exemplo n.º 11
0
    def __init__(self,
                 whitelist: 'pynini.FstLike',
                 deterministic: bool = True):
        """
        Builds the weighted "abbreviation" classify FST, excluding inputs that
        the whitelist already covers.

        Args:
            whitelist: object whose ``graph`` attribute holds the whitelist FST
            deterministic: forwarded to the base class
        """
        super().__init__(name="abbreviation",
                         kind="classify",
                         deterministic=deterministic)

        # upper-case letters, a space inserted between each (two or more)
        main_graph = NEMO_UPPER + pynini.closure(insert_space + NEMO_UPPER, 1)

        lower_or_lowered = pynini.union(TO_LOWER | NEMO_LOWER)
        spaced_lowered = pynutil.add_weight(
            TO_LOWER + pynini.closure(insert_space + lower_or_lowered), 110)
        upper_then_spaced_lower = pynutil.add_weight(
            pynini.closure(NEMO_UPPER, 2) +
            pynini.closure(insert_space + NEMO_LOWER, 1), 110)
        # dots after each letter are deleted
        upper_dot = NEMO_UPPER + pynutil.delete(".")
        dotted_upper = upper_dot + pynini.closure(insert_space + upper_dot)
        lowered_dot = TO_LOWER + pynutil.delete(".")
        dotted_lowered = pynutil.add_weight(
            lowered_dot + pynini.closure(insert_space + lowered_dot), 110)

        misc_graph = (spaced_lowered | upper_then_spaced_lower | dotted_upper
                      | dotted_lowered)

        # set weight of the misc graph to a value higher than word
        graph = pynutil.add_weight(main_graph.optimize(), 10)
        graph = graph | pynutil.add_weight(misc_graph.optimize(), 101)

        # exclude words that are included in the whitelist
        allowed_inputs = pynini.difference(
            pynini.project(graph, "input"),
            pynini.project(whitelist.graph, "input"))
        graph = pynini.compose(allowed_inputs, graph)

        tagged = pynutil.insert(
            "value: \"") + graph.optimize() + pynutil.insert("\"")
        self.fst = self.add_tokens(tagged).optimize()
Exemplo n.º 12
0
    def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
        """
        Builds the "whitelist" classify FST from a whitelist table, the
        measurement table and the TO_CYRILLIC mapping.

        Args:
            input_case: accepted for interface compatibility; the written form
                is lower-cased whatever its value
            deterministic: forwarded to the base class
            input_file: optional whitelist path; when given, its table replaces
                the default "data/whitelist.tsv" table
        """
        super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

        def _load_lowercased(file):
            # first column is lower-cased, remaining columns are kept as is
            rows = load_labels(file)
            return pynini.string_map([[row[0].lower()] + row[1:] for row in rows])

        graph = _load_lowercased(get_abs_path("data/whitelist.tsv"))

        if input_file:
            graph = _load_lowercased(input_file)

        units_graph = _load_lowercased(file=get_abs_path("data/measurements.tsv"))
        # single-letter units such as `м` are not replaced, while single
        # non-RU_ALPHA symbols such as `°` and `%` are
        unit_inputs = NEMO_CHAR ** (2, ...) | pynini.difference(NEMO_CHAR, RU_ALPHA)
        units_graph = pynini.compose(unit_inputs, units_graph)

        graph |= units_graph.optimize()
        # one or more TO_CYRILLIC symbols, separated by inserted spaces
        graph |= TO_CYRILLIC + pynini.closure(pynutil.insert(" ") + TO_CYRILLIC)

        self.final_graph = convert_space(graph)
        tagged = pynutil.insert("name: \"") + self.final_graph + pynutil.insert("\"")
        self.fst = tagged.optimize()
Exemplo n.º 13
0
def alternate_rand_gen_no_duplicate(acceptor, n):
    """
    Collects up to ``n`` distinct random strings from ``acceptor``.

    Up to 10 rounds are run. Each round samples uniformly from the current
    acceptor, appends the strings not collected so far, and then subtracts
    the sample from the acceptor so later rounds cannot draw it again.

    Args:
        acceptor: FSA to sample from
        n: number of distinct strings wanted

    Returns:
        (acceptor, rand_list). As soon as ``rand_list`` reaches ``n`` strings
        the function returns immediately; in that case the sample of the
        current round has NOT been subtracted from the returned acceptor.
        If the rounds run out, ``rand_list`` holds fewer than ``n`` strings.
    """
    rand_list = []
    # mirrors rand_list so membership tests are O(1) instead of a list scan
    seen = set()
    loop = 10
    seed = 0
    for i in range(loop):
        print('(alternate) trying to generate random strings (' + str(i) + ')')
        num = int(n + n * i * .01)
        temp = pynini.randgen(acceptor,
                              npath=num,
                              seed=seed,
                              select='uniform',
                              max_length=2147483647,
                              weighted=False)
        print('made new `temp`')
        temp_list = list_string_set(temp)
        print('temp got ' + str(len(temp_list)) + ' random strings')
        temp_list = list(set(temp_list))
        # temp_list is duplicate-free, so this is exactly the set of strings
        # that will be appended below
        new_strings = [t for t in temp_list if t not in seen]
        print('got ' + str(len(new_strings)) + ' new strings')
        for t in new_strings:
            rand_list.append(t)
            seen.add(t)
            if len(rand_list) == n:
                print('rand_list now has ' + str(len(rand_list)) +
                      ' strings')
                print('finally got enough strings in rand_list; i=' +
                      str(i))
                return acceptor, rand_list
        acceptor = pynini.difference(acceptor, temp)
        seed += 1
        print('rand_list now has ' + str(len(rand_list)) + ' strings')
        print('need to add strings to rand_list (' + str(i) + ')')
    print('finished loop; returning incomplete set')
    return acceptor, rand_list
Exemplo n.º 14
0
    def __init__(self, deterministic: bool = True):
        """
        Builds the "word" classify FST: a non-empty run of non-space characters
        that are not inputs of the punctuation graph.

        Args:
            deterministic: when False, tokens made of a currency sign followed
                by one or more digits (e.g. "€5") are excluded
        """
        super().__init__(name="word",
                         kind="classify",
                         deterministic=deterministic)

        punct = PunctuationFst().graph
        self.graph = pynini.closure(
            pynini.difference(NEMO_NOT_SPACE, punct.project("input")), 1)

        if not deterministic:
            currency_amount = (pynini.union("$", "€", "₩", "£", "¥") +
                               pynini.closure(NEMO_DIGIT, 1))
            # Subtract at whole-token level. Wrapping the difference in
            # closure(..., 1) again would re-admit "€5" as "€" followed by
            # "5", because each single character survives the difference;
            # that made the exclusion a no-op.
            self.graph = pynini.difference(self.graph, currency_amount)

        self.fst = (pynutil.insert("name: \"") + self.graph +
                    pynutil.insert("\"")).optimize()
Exemplo n.º 15
0
def getNegString(fsa, min_len, max_len):
    """
    For every length from ``min_len`` to ``max_len`` (inclusive), lists the
    strings of exactly that length over ``sigma`` that ``fsa`` does not
    accept, in random order.

    Returns:
        dict mapping length -> randomly permuted list of those strings
    """
    neg_str_dict = {}
    for length in range(min_len, max_len + 1):
        # all strings of this exact length, minus the language of fsa
        complement = pynini.difference(pynini.closure(sigma, length, length), fsa)
        shuffled = list(np.random.permutation(listStringSet(complement)))
        neg_str_dict[length] = shuffled
        print(shuffled)
    return neg_str_dict
Exemplo n.º 16
0
    def __init__(self, deterministic: bool = True):
        """
        Builds the "cardinal" classify FST and stores it in ``self.fst``.

        Also exposes ``self.graph``,
        ``self.graph_hundred_component_at_least_one_none_zero_digit`` and
        ``self.single_digits_graph``. ``self.range_graph`` is only created
        when ``deterministic`` is False.

        Args:
            deterministic: forwarded to the base class; when False, additional
                readings and range expressions are added
        """
        super().__init__(name="cardinal",
                         kind="classify",
                         deterministic=deterministic)

        # number-name grammar loaded from a FAR archive
        graph = pynini.Far(
            get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()
        # input restricted to 2-3 digits, or a single non-zero digit
        self.graph_hundred_component_at_least_one_none_zero_digit = (
            pynini.closure(NEMO_DIGIT, 2, 3)
            | pynini.difference(NEMO_DIGIT, pynini.accep("0"))) @ graph
        # 1-3 digits followed by any number of 3-digit groups; a comma in
        # front of a group is optional and is deleted
        self.graph = (pynini.closure(NEMO_DIGIT, 1, 3) + pynini.closure(
            pynini.closure(pynutil.delete(","), 0, 1) + NEMO_DIGIT +
            NEMO_DIGIT + NEMO_DIGIT)) @ graph

        graph_digit = pynini.string_file(
            get_abs_path("data/numbers/digit.tsv"))
        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        # one digit read on its own: the inverted digit/zero tables
        # (weight 1.2) or "0" -> "oh" (weight 1.1)
        single_digits_graph = pynutil.add_weight(
            pynini.invert(graph_digit | graph_zero), 1.2) | pynutil.add_weight(
                pynini.cross("0", "oh"), 1.1)
        # one or more digits read one by one, separated by inserted spaces
        self.single_digits_graph = single_digits_graph + pynini.closure(
            pynutil.insert(" ") + single_digits_graph)

        if not deterministic:
            # digit-by-digit reading of comma-grouped numbers: 1-3 leading
            # digit sequences, then one or more ",ddd" groups (comma deleted)
            single_digits_graph_with_commas = pynini.closure(
                self.single_digits_graph + pynutil.insert(" "), 1,
                3) + pynini.closure(
                    pynutil.delete(",") + single_digits_graph +
                    pynutil.insert(" ") + single_digits_graph +
                    pynutil.insert(" ") + single_digits_graph,
                    1,
                )
            # get_hundreds_graph() is defined outside this block
            self.graph |= self.single_digits_graph | get_hundreds_graph(
            ) | single_digits_graph_with_commas
            # "<n>-<m>" -> "[from ]<n> to <m>" or "<n> <m>"
            self.range_graph = (
                pynini.closure(pynutil.insert("from "), 0, 1) + self.graph +
                (pynini.cross("-", " to ") | pynini.cross("-", " ")) +
                self.graph)

            # "<n>x<m>" or "<n> x <m>" -> "<n> by <m>"
            self.range_graph |= self.graph + (pynini.cross(
                "x", " by ") | pynini.cross(" x ", " by ")) + self.graph
            self.range_graph = self.range_graph.optimize()

        # optional leading "-" becomes `negative: "true" `
        optional_minus_graph = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0,
            1)
        # self.get_serial_graph() is defined outside this block
        final_graph = self.graph | pynutil.add_weight(self.get_serial_graph(),
                                                      1.2)

        if not deterministic:
            final_graph |= self.range_graph

        final_graph = optional_minus_graph + pynutil.insert(
            "integer: \"") + final_graph + pynutil.insert("\"")
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Exemplo n.º 17
0
    def __init__(self, decimal: GraphFst, cardinal: GraphFst, fraction: GraphFst, deterministic: bool = True):
        """
        Builds the "measure" verbalize FST from tagged cardinal, decimal and
        fraction values combined with a ``units: "..."`` field.

        Args:
            decimal: decimal verbalizer; its ``numbers`` graph is used
            cardinal: cardinal verbalizer; its ``numbers`` and
                ``optional_sign`` graphs are used
            fraction: fraction verbalizer; its ``graph`` is used
            deterministic: forwarded to the base class
        """
        super().__init__(name="measure", kind="verbalize", deterministic=deterministic)
        optional_sign = cardinal.optional_sign

        def _unwrap(opening, inner):
            # delete `opening` and the closing "}" around `inner`
            return pynutil.delete(opening) + delete_space + inner + delete_space + pynutil.delete("}")

        # any unit name without spaces, except the reserved "address" and "math"
        unit_name = pynini.difference(pynini.closure(NEMO_CHAR - " ", 1), pynini.union("address", "math"))
        unit = pynutil.delete("units: \"") + unit_name + pynutil.delete("\"") + delete_space

        graph_decimal = _unwrap("decimal {", optional_sign + delete_space + decimal.numbers)
        graph_cardinal = _unwrap("cardinal {", optional_sign + delete_space + cardinal.numbers)
        graph_fraction = _unwrap("fraction {", fraction.graph)

        # value first, unit second
        graph = (graph_cardinal | graph_decimal | graph_fraction) + delete_space + insert_space + unit

        # SH adds "preserve_order: true" by default
        preserve_order = pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space
        # unit first, value second
        graph |= unit + insert_space + (graph_cardinal | graph_decimal) + delete_space + pynini.closure(preserve_order)

        def _reserved_unit(tag):
            # the reserved unit tag is deleted and only the cardinal is kept
            return (
                pynutil.delete(tag)
                + delete_space
                + graph_cardinal
                + delete_space
                + pynini.closure(preserve_order)
            )

        address = _reserved_unit("units: \"address\" ")
        math_graph = _reserved_unit("units: \"math\" ")
        graph |= address | math_graph

        self.fst = self.delete_tokens(graph).optimize()
Exemplo n.º 18
0
def border(fsa, n):
    """
    Returns an FST mapping each length-``n`` string of ``fsa`` to the strings
    outside ``fsa`` that ``editExactly1`` relates it to.
    """
    # complement of the language
    outside = pynini.difference(sigmaStar, fsa).optimize()
    # this is the key insight which gives entire border
    border_pairs = (fsa @ editExactly1 @ outside).optimize()
    # here we limit the border to input words of length=n
    exactly_n = pynini.closure(sigma, n, n).optimize()
    return (exactly_n @ border_pairs).optimize()
Exemplo n.º 19
0
def make_byte_star_except_boundary(
        boundary: pynini.FstLike = "+") -> pynini.Fst:
    """Builds sigma-star over bytes with the boundary symbol removed.

    Args:
        boundary: the boundary symbol to exclude.

    Returns:
        An optimized acceptor for any sequence of bytes other than the boundary.
    """
    non_boundary_byte = pynini.difference(byte.BYTE, boundary)
    return non_boundary_byte.closure().optimize()
Exemplo n.º 20
0
def get_quantity(deci):
    """
    Returns FST that tags a cardinal or the decimal ``deci`` followed by a
    quantity word as ``integer_part: "..." quantity: "..."`` or
    ``<deci> quantity: "..."``.

    Args:
        deci: decimal FST
    """
    # delete leading zeros; a non-zero digit must come first after them
    significant_digits = (pynutil.delete(pynini.closure("0")) +
                          pynini.difference(NEMO_DIGIT, "0") +
                          pynini.closure(NEMO_DIGIT))
    numbers = (cardinal.graph_hundred_component_at_least_one_none_zero_digit
               @ significant_digits)

    suffix = pynini.union("million", "billion", "trillion", "quadrillion",
                          "quintillion", "sextillion")
    open_quantity = pynutil.insert("quantity: \"")
    close_quote = pynutil.insert("\"")

    cardinal_branch = (pynutil.insert("integer_part: \"") + numbers +
                       close_quote + delete_extra_space + open_quantity +
                       suffix + close_quote)
    # decimals additionally accept "thousand"
    decimal_branch = (deci + delete_extra_space + open_quantity +
                      (suffix | "thousand") + close_quote)
    return cardinal_branch | decimal_branch
Exemplo n.º 21
0
def border(fsa, n):
    '''
    Takes an fsa and produces an fst that maps strings of length n in the
    language to "border" strings: strings outside the language that
    editExactly1 relates them to.
    '''
    # everything over sigma that the fsa rejects
    complement = pynini.difference(sigmaStar, fsa).optimize()
    # this is the key insight which gives entire border
    pairs = (fsa @ editExactly1 @ complement).optimize()
    # here we limit the border to input words of length=n
    length_n = pynini.closure(sigma, n, n).optimize()
    return (length_n @ pairs).optimize()
Exemplo n.º 22
0
    def __init__(self, deterministic: bool = True):
        """
        Builds the "word" classify FST: either a bracketed phoneme sequence or
        a non-empty run of non-space characters that are not inputs of the
        punctuation graph.

        Args:
            deterministic: when False, tokens made of one of the listed
                symbols followed by one or more digits (e.g. "€5", "#1") are
                excluded
        """
        super().__init__(name="word", kind="classify", deterministic=deterministic)

        punct = PunctuationFst().graph
        self.graph = pynini.closure(pynini.difference(NEMO_NOT_SPACE, punct.project("input")), 1)

        if not deterministic:
            # "$" was listed twice in the original union; once is enough
            symbol_then_digits = pynini.union("$", "€", "₩", "£", "¥", "#", "%") + pynini.closure(NEMO_DIGIT, 1)
            # Subtract at whole-token level. Wrapping the difference in
            # closure(..., 1) again would re-admit "€5" as "€" followed by
            # "5", because each single character survives the difference;
            # that made the exclusion a no-op.
            self.graph = pynini.difference(self.graph, symbol_then_digits)

        # leave phones of format [HH AH0 L OW1] untouched
        phoneme_unit = pynini.closure(NEMO_ALPHA, 1) + pynini.closure(NEMO_DIGIT)
        phoneme = (
            pynini.accep(pynini.escape("["))
            + pynini.closure(phoneme_unit + pynini.accep(" "))
            + phoneme_unit
            + pynini.accep(pynini.escape("]"))
        )
        self.graph = plurals._priority_union(convert_space(phoneme), self.graph, NEMO_SIGMA)
        self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()
Exemplo n.º 23
0
def rand_gen_no_duplicate(acceptor, n):
    """
    Samples ``acceptor`` until one sample holds at least ``n`` distinct
    strings, enlarging the sample size each round (10 rounds at most).

    Args:
        acceptor: FSA to sample from
        n: number of distinct strings wanted

    Returns:
        (acceptor with the accepted sample removed,
         sorted list of at most ``n`` distinct strings)
    """
    max_rounds = 10
    for attempt in range(max_rounds):
        sample_size = int(n + n*attempt*0.1)
        sample = pynini.randgen(acceptor, npath=sample_size, seed=0, select="uniform",
                                max_length=2147483647, weighted=False)
        unique = list(set(list_string_set(sample)))
        # accept the sample once it is large enough, or on the last round
        if len(unique) >= n or attempt >= max_rounds - 1:
            random.shuffle(unique)
            chosen = sorted(unique[:n])
            acceptor = pynini.difference(acceptor, sample)
            return acceptor, chosen
        print('insufficient random strings')
Exemplo n.º 24
0
    def __init__(self):
        """
        Builds the "cardinal" classify FST and stores it in ``self.fst``.

        Also exposes ``self.graph`` and
        ``self.graph_hundred_component_at_least_one_none_zero_digit``.
        """
        super().__init__(name="cardinal", kind="classify")

        # number-name grammar loaded from a FAR archive
        number_names = pynini.Far(
            get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()

        # 2-3 digits, or a single non-zero digit
        up_to_three_digits = (pynini.closure(NEMO_DIGIT, 2, 3)
                              | pynini.difference(NEMO_DIGIT,
                                                  pynini.accep("0")))
        self.graph_hundred_component_at_least_one_none_zero_digit = (
            up_to_three_digits @ number_names)

        # 1-3 digits followed by 3-digit groups; a comma before a group is
        # optional and is deleted
        optional_comma = pynini.closure(pynutil.delete(","), 0, 1)
        grouped_digits = pynini.closure(NEMO_DIGIT, 1, 3) + pynini.closure(
            optional_comma + NEMO_DIGIT + NEMO_DIGIT + NEMO_DIGIT)
        self.graph = grouped_digits @ number_names

        # optional leading "-" becomes `negative: "true" `
        optional_minus_graph = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0,
            1)

        tagged = (optional_minus_graph + pynutil.insert("integer: \"") +
                  self.graph + pynutil.insert("\""))
        self.fst = self.add_tokens(tagged).optimize()
Exemplo n.º 25
0
def get_quantity(
        decimal: 'pynini.FstLike',
        cardinal_up_to_thousand: 'pynini.FstLike') -> 'pynini.FstLike':
    """
    Returns FST that tags a cardinal or a decimal followed by a quantity word,
    e.g. one million -> integer_part: "1" quantity: "million"
    e.g. one point five million -> integer_part: "1" fractional_part: "5" quantity: "million"

    Will tag cases up to denominations of tens of hundreds of thousand. 'douze cent mille millions' -> 1 200 000 millions

    Args:
        decimal: decimal FST
        cardinal_up_to_thousand: cardinal FST
    """
    # delete leading zeros; a non-zero digit must come first after them
    significant_digits = (pynutil.delete(pynini.closure("0")) +
                          pynini.difference(NEMO_DIGIT, "0") +
                          pynini.closure(NEMO_DIGIT))
    numbers = cardinal_up_to_thousand @ significant_digits

    quantity_words = (
        "million",
        "millions",
        "milliard",
        "milliards",
        "billion",
        "billions",
        "billiard",
        "billiards",
        "trillion",
        "trillions",
        "trilliard",
        "trilliards",
    )
    suffix = pynini.union(*quantity_words)
    tagged_quantity = (pynutil.insert(" quantity: \"") + suffix +
                       pynutil.insert("\""))

    # Can be written either as 'deux-millions' or 'deux millions' depending on
    # whether it registers as a noun or part of cardinal.
    hyphen_or_space = pynini.union(delete_hyphen, delete_extra_space)

    cardinal_branch = (pynutil.insert("integer_part: \"") + numbers +
                       pynutil.insert("\"") + hyphen_or_space +
                       tagged_quantity)
    decimal_branch = decimal + delete_extra_space + tagged_quantity
    return cardinal_branch | decimal_branch
Exemplo n.º 26
0
def get_quantity(decimal: "pynini.FstLike",
                 cardinal_up_to_hundred: "pynini.FstLike") -> "pynini.FstLike":
    """
    Build an FST that tags a cardinal or a decimal followed by a quantity word.

    e.g. một triệu -> integer_part: "1" quantity: "triệu"
    e.g. một tỷ rưỡi -> integer_part: "1" quantity: "tỷ" fractional_part: "5"
    (in the cardinal branch the optional fraction is emitted after the quantity)

    Args:
        decimal: decimal FST
        cardinal_up_to_hundred: cardinal FST supplying the integer part

    Returns:
        The union of the "cardinal + quantity [+ one-digit fraction]" graph and
        the "decimal + quantity" graph.
    """
    # Accept only digit strings holding a non-zero digit; leading zeros are dropped.
    without_leading_zeros = (pynutil.delete(pynini.closure("0")) +
                             pynini.difference(NEMO_DIGIT, "0") +
                             pynini.closure(NEMO_DIGIT))
    numbers = cardinal_up_to_hundred @ without_leading_zeros

    suffix = pynini.union("triệu", "tỉ", "tỷ", "vạn")

    # Special spoken forms allowed for the trailing fractional digit.
    graph_four = pynini.cross("tư", "4")
    graph_one = pynini.cross("mốt", "1")
    graph_half = pynini.cross("rưỡi", "5")

    # Every digit word except "năm" may also appear as the trailing digit.
    five_word = pynini.project(pynini.cross("năm", "5"), "input")
    allowed_digit_words = pynini.difference(
        pynini.project(graph_digit, "input"), five_word.arcsort())
    last_digit = pynini.union(
        allowed_digit_words @ graph_digit,
        graph_one,
        graph_four,
        graph_half,
    )

    fraction_digit = pynini.union(last_digit, graph_half, graph_one,
                                  graph_four)
    fraction_field = (delete_extra_space +
                      pynutil.insert('fractional_part: "') + fraction_digit +
                      pynutil.insert('"'))
    optional_fraction_graph = pynini.closure(fraction_field, 0, 1)

    integer_field = (pynutil.insert('integer_part: "') + numbers +
                     pynutil.insert('"'))
    cardinal_branch = (integer_field + delete_extra_space +
                       pynutil.insert('quantity: "') + suffix +
                       pynutil.insert('"') + optional_fraction_graph)

    # After a decimal, the thousand words "ngàn" / "nghìn" are accepted as well.
    decimal_branch = (decimal + delete_extra_space +
                      pynutil.insert('quantity: "') +
                      (suffix | "ngàn" | "nghìn") + pynutil.insert('"'))

    return cardinal_branch | decimal_branch
Exemplo n.º 27
0
    def __init__(self, ordinal: GraphFst, deterministic: bool = True):
        """
        Build the date verbalizer from tagged ``day``/``month``/``year`` fields.

        Accepted layouts are "day month [year]", "month year" and "year" alone.
        The day (and any month value that is not one of the known month names)
        is rewritten with ``ordinal.ordinal_stem`` at the end of the string and
        then gets "ter" appended.

        Args:
            ordinal: ordinal grammar providing ``ordinal_stem``
            deterministic: forwarded to the base class constructor
        """
        super().__init__(name="date",
                         kind="verbalize",
                         deterministic=deterministic)

        def quoted_field(prefix):
            # Strip `prefix` and the closing quote, keeping the non-empty value.
            return (pynutil.delete(prefix) +
                    pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\""))

        stem_at_end = pynini.cdrewrite(ordinal.ordinal_stem, "", "[EOS]",
                                       NEMO_SIGMA)
        ter_suffix = pynutil.insert("ter")

        day_cardinal = quoted_field("day: \"")
        day = (day_cardinal @ stem_at_end) + ter_suffix

        # Month names are taken from the second column of the abbreviation table.
        label_rows = load_labels(get_abs_path("data/months/abbr_to_name.tsv"))
        months_names = pynini.union(*(row[1] for row in label_rows))

        month = quoted_field("month: \"")
        named_month = month @ months_names
        # Anything that is not a month name is verbalized like the day.
        not_a_month_name = pynini.difference(NEMO_SIGMA, months_names)
        other_month = ((month @ not_a_month_name) @ stem_at_end) + ter_suffix
        final_month = named_month | other_month

        year = quoted_field("year: \"")

        space = pynini.accep(" ")
        optional_year = pynini.closure(pynini.accep(" ") + year, 0, 1)

        # day month year
        day_month_year = day + space + final_month + optional_year
        month_year = final_month + pynini.accep(" ") + year
        graph_dmy = day_month_year | month_year

        self.graph = graph_dmy | year

        with_order_removed = self.graph + delete_preserve_order
        self.fst = self.delete_tokens(with_order_removed).optimize()
Exemplo n.º 28
0
    def __init__(
        self,
        input_case: str,
        deterministic: bool = True,
        cache_dir: str = None,
        overwrite_cache: bool = True,
        whitelist: str = None,
    ):
        """
        Build (or restore from cache) a combined classify-and-verbalize grammar.

        Each tagger FST is composed with its verbalizer FST, the results are
        unioned with per-class weights, and the union is wrapped with
        punctuation and whitespace handling. The result is stored in
        ``self.fst``; ``self.fst_no_digits`` is the same grammar restricted to
        outputs that contain no digit characters.

        Args:
            input_case: forwarded to WhiteListFst and used in the cache file name
            deterministic: forwarded to the grammars; when False, the roman and
                abbreviation grammars are added as well
            cache_dir: directory for the .far cache file; None or the string
                'None' disables caching
            overwrite_cache: when False and the cache file exists, the grammar
                is loaded from it instead of being rebuilt
            whitelist: path handed to WhiteListFst as ``input_file``; its
                basename is also part of the cache file name
        """
        super().__init__(name="tokenize_and_classify",
                         kind="classify",
                         deterministic=deterministic)

        # Work out the cache file path; without a cache_dir, far_file stays None.
        far_file = None
        if cache_dir is not None and cache_dir != 'None':
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            far_file = os.path.join(
                cache_dir,
                f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
            )
        # Restore the cached grammar when allowed and present; otherwise build it.
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode='r')['tokenize_and_classify']
            # Accepts only strings made of non-digit characters.
            no_digits = pynini.closure(pynini.difference(
                NEMO_CHAR, NEMO_DIGIT))
            self.fst_no_digits = pynini.compose(self.fst, no_digits).optimize()
            logging.info(f'ClassifyFst.fst was restored from {far_file}.')
        else:
            logging.info(
                f'Creating ClassifyFst grammars. This might take some time...')
            # TAGGERS
            cardinal = CardinalFst(deterministic=deterministic)
            cardinal_graph = cardinal.fst

            ordinal = OrdinalFst(cardinal=cardinal,
                                 deterministic=deterministic)
            # Always-deterministic ordinal tagger, used only by SerialFst below.
            deterministic_ordinal = OrdinalFst(cardinal=cardinal,
                                               deterministic=True)
            ordinal_graph = ordinal.fst

            decimal = DecimalFst(cardinal=cardinal,
                                 deterministic=deterministic)
            decimal_graph = decimal.fst
            fraction = FractionFst(deterministic=deterministic,
                                   cardinal=cardinal)
            fraction_graph = fraction.fst

            measure = MeasureFst(cardinal=cardinal,
                                 decimal=decimal,
                                 fraction=fraction,
                                 deterministic=deterministic)
            measure_graph = measure.fst
            date_graph = DateFst(cardinal=cardinal,
                                 deterministic=deterministic).fst
            word_graph = WordFst(deterministic=deterministic).graph
            time_graph = TimeFst(cardinal=cardinal,
                                 deterministic=deterministic).fst
            telephone_graph = TelephoneFst(deterministic=deterministic).fst
            electronic_graph = ElectronicFst(deterministic=deterministic).fst
            money_graph = MoneyFst(cardinal=cardinal,
                                   decimal=decimal,
                                   deterministic=deterministic).fst
            # NOTE: `whitelist` is rebound here from the file path to the
            # WhiteListFst instance; that instance is later given to AbbreviationFst.
            whitelist = WhiteListFst(input_case=input_case,
                                     deterministic=deterministic,
                                     input_file=whitelist)
            whitelist_graph = whitelist.graph
            punct_graph = PunctuationFst(deterministic=deterministic).graph
            serial_graph = SerialFst(cardinal=cardinal,
                                     ordinal=deterministic_ordinal,
                                     deterministic=deterministic).fst

            # VERBALIZERS
            # NOTE: from here on `cardinal`, `decimal`, `ordinal`, `fraction` and
            # `measure` refer to verbalizer instances, no longer to the taggers.
            cardinal = vCardinal(deterministic=deterministic)
            v_cardinal_graph = cardinal.fst
            decimal = vDecimal(cardinal=cardinal, deterministic=deterministic)
            v_decimal_graph = decimal.fst
            ordinal = vOrdinal(deterministic=deterministic)
            v_ordinal_graph = ordinal.fst
            fraction = vFraction(deterministic=deterministic)
            v_fraction_graph = fraction.fst
            v_telephone_graph = vTelephone(deterministic=deterministic).fst
            v_electronic_graph = vElectronic(deterministic=deterministic).fst
            measure = vMeasure(decimal=decimal,
                               cardinal=cardinal,
                               fraction=fraction,
                               deterministic=deterministic)
            v_measure_graph = measure.fst
            v_time_graph = vTime(deterministic=deterministic).fst
            v_date_graph = vDate(ordinal=ordinal,
                                 deterministic=deterministic).fst
            v_money_graph = vMoney(decimal=decimal,
                                   deterministic=deterministic).fst
            v_roman_graph = vRoman(deterministic=deterministic).fst
            v_abbreviation = vAbbreviation(deterministic=deterministic).fst

            # Always-deterministic time and date pipelines (tagger composed with
            # verbalizer) that serve as inputs to RangeFst.
            det_v_time_graph = vTime(deterministic=True).fst
            det_v_date_graph = vDate(ordinal=vOrdinal(deterministic=True),
                                     deterministic=True).fst
            time_final = pynini.compose(time_graph, det_v_time_graph)
            date_final = pynini.compose(date_graph, det_v_date_graph)
            range_graph = RangeFst(time=time_final,
                                   date=date_final,
                                   cardinal=CardinalFst(deterministic=True),
                                   deterministic=deterministic).fst
            v_word_graph = vWord(deterministic=deterministic).fst

            # Path weights: semiotic classes, plain words, punctuation.
            sem_w = 1
            word_w = 100
            punct_w = 2
            # Union of every tagger composed with its verbalizer. The date
            # branch gets a slightly smaller weight than the other semiotic
            # classes; the serial branch a slightly larger one.
            classify_and_verbalize = (
                pynutil.add_weight(whitelist_graph, sem_w)
                | pynutil.add_weight(pynini.compose(time_graph, v_time_graph),
                                     sem_w)
                | pynutil.add_weight(
                    pynini.compose(decimal_graph, v_decimal_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(measure_graph, v_measure_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(cardinal_graph, v_cardinal_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(ordinal_graph, v_ordinal_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(telephone_graph, v_telephone_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(electronic_graph, v_electronic_graph),
                    sem_w)
                | pynutil.add_weight(
                    pynini.compose(fraction_graph, v_fraction_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(money_graph, v_money_graph), sem_w)
                | pynutil.add_weight(word_graph, word_w)
                | pynutil.add_weight(pynini.compose(date_graph, v_date_graph),
                                     sem_w - 0.01)
                | pynutil.add_weight(pynini.compose(range_graph, v_word_graph),
                                     sem_w)
                | pynutil.add_weight(
                    pynini.compose(serial_graph, v_word_graph),
                    1.1001)  # should be higher than the rest of the classes
            ).optimize()

            # Roman numerals and abbreviations are only added in non-deterministic mode.
            if not deterministic:
                roman_graph = RomanFst(deterministic=deterministic).fst
                # the weight matches the word_graph weight for "I" cases in long sentences with multiple semiotic tokens
                classify_and_verbalize |= pynutil.add_weight(
                    pynini.compose(roman_graph, v_roman_graph), word_w)

                abbreviation_graph = AbbreviationFst(
                    whitelist=whitelist, deterministic=deterministic).fst
                classify_and_verbalize |= pynutil.add_weight(
                    pynini.compose(abbreviation_graph, v_abbreviation), word_w)

            # One or more items, each either a whitespace run (rewritten to a
            # single space) or a punctuation mark preceded by an inserted space.
            punct_only = pynutil.add_weight(punct_graph, weight=punct_w)
            punct = pynini.closure(
                pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1),
                               delete_extra_space)
                | (pynutil.insert(" ") + punct_only),
                1,
            )

            # A token optionally surrounded by punctuation on either side.
            token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) +
                                classify_and_verbalize +
                                pynini.closure(pynutil.insert(" ") + punct))

            # A sequence of such tokens separated by whitespace or punctuation.
            graph = token_plus_punct + pynini.closure(
                (pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1),
                                delete_extra_space)
                 | (pynutil.insert(" ") + punct + pynutil.insert(" "))) +
                token_plus_punct)

            # Input consisting of punctuation only is accepted too.
            graph |= punct_only + pynini.closure(punct)
            graph = delete_space + graph + delete_space

            # Output filter: non-space chunks separated by whitespace runs that
            # are rewritten to one space; the second alternative also deletes
            # leading spaces.
            remove_extra_spaces = pynini.closure(
                NEMO_NOT_SPACE,
                1) + pynini.closure(delete_extra_space +
                                    pynini.closure(NEMO_NOT_SPACE, 1))
            remove_extra_spaces |= (
                pynini.closure(pynutil.delete(" "), 1) +
                pynini.closure(NEMO_NOT_SPACE, 1) +
                pynini.closure(delete_extra_space +
                               pynini.closure(NEMO_NOT_SPACE, 1)))

            graph = pynini.compose(graph.optimize(),
                                   remove_extra_spaces).optimize()
            self.fst = graph
            # Accepts only strings made of non-digit characters.
            no_digits = pynini.closure(pynini.difference(
                NEMO_CHAR, NEMO_DIGIT))
            self.fst_no_digits = pynini.compose(graph, no_digits).optimize()

            # Persist the freshly built grammar when a cache location was given.
            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f'ClassifyFst grammars are saved to {far_file}.')
Exemplo n.º 29
0
    def __init__(self):
        """
        Build the cardinal tagger.

        The grammar reads up to eight three-digit groups (sextillion down to the
        plain hundreds group), strips leading zeros from the digit string and
        wraps it as ``integer: "..."`` with an optional ``negative: "-"`` field
        for a leading "minus".

        Attributes set:
            graph_hundred_component_at_least_one_none_zero_digit: three-digit
                group whose output contains at least one non-zero digit
            graph_no_exception: the full cardinal graph
            graph: the same graph without the inputs num_to_word(0) .. num_to_word(12)
            fst: the final tagger
        """
        super().__init__(name="cardinal", kind="classify")
        zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))

        # Hundreds position: "<digit> hundred", or an inserted "0" when absent.
        hundreds_position = pynini.union(
            digit + delete_space + pynini.cross("hundred", ""),
            pynutil.insert("0"),
        )
        # Tens and units: a teen (or "00"), or ties and digit each defaulting to "0".
        tens_and_units = pynini.union(
            teen | pynutil.insert("00"),
            (ties | pynutil.insert("0")) + delete_space + (digit | pynutil.insert("0")),
        )
        graph_hundred_component = hundreds_position + delete_space + tens_and_units

        contains_nonzero_digit = (
            pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT)
        )
        nonzero_hundred_component = graph_hundred_component @ contains_nonzero_digit
        self.graph_hundred_component_at_least_one_none_zero_digit = nonzero_hundred_component

        def magnitude_group(word):
            # Non-zero three-digit group followed by `word`, or a weighted "000" filler.
            return pynini.union(
                nonzero_hundred_component + delete_space + pynutil.delete(word),
                pynutil.insert("000", weight=0.1),
            )

        # Groups are concatenated from the largest magnitude down to the hundreds.
        magnitude_words = (
            "sextillion",
            "quintillion",
            "quadrillion",
            "trillion",
            "billion",
            "million",
            "thousand",
        )
        all_groups = magnitude_group(magnitude_words[0])
        for word in magnitude_words[1:]:
            all_groups = all_groups + delete_space + magnitude_group(word)
        all_groups = all_groups + delete_space + graph_hundred_component

        graph = pynini.union(all_groups, zero)

        # Drop leading zeros, but let a lone "0" through.
        strip_leading_zeros = pynini.union(
            pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0"
        )
        graph = graph @ strip_leading_zeros

        graph_exception = pynini.union(*map(num_to_word, range(0, 13)))

        # Remove "and" between spaces; the input must start with a letter.
        drop_and = pynini.cdrewrite(pynutil.delete("and"), NEMO_SPACE, NEMO_SPACE, NEMO_SIGMA)
        graph = (drop_and @ (NEMO_ALPHA + NEMO_SIGMA)) @ graph

        self.graph_no_exception = graph

        # Exclude the exception words from the accepted inputs.
        accepted_inputs = pynini.project(graph, "input") - graph_exception.arcsort()
        self.graph = accepted_inputs @ graph

        minus_field = pynutil.insert("negative: ") + pynini.cross("minus", "\"-\"") + NEMO_SPACE
        optional_minus_graph = pynini.closure(minus_field, 0, 1)

        integer_field = pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")
        tagged = self.add_tokens(optional_minus_graph + integer_field)
        self.fst = tagged.optimize()
Exemplo n.º 30
0
    from pynini.examples import plurals
    from pynini.lib import byte, pynutil, utf8

    # Any single valid UTF-8 character.
    NEMO_CHAR = utf8.VALID_UTF8_CHAR

    # Character classes built from the ASCII tables in the `string` module.
    NEMO_DIGIT = byte.DIGIT
    NEMO_LOWER = pynini.union(*string.ascii_lowercase).optimize()
    NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize()
    NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize()
    NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize()
    NEMO_HEX = pynini.union(*string.hexdigits).optimize()
    NEMO_NON_BREAKING_SPACE = u"\u00A0"
    NEMO_SPACE = " "
    # Space, tab, newline, carriage return and the non-breaking space.
    NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r",
                                    u"\u00A0").optimize()
    # Any character other than the whitespace characters above.
    NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
    # Any character other than a double quote.
    NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()

    # Each character of string.punctuation, escaped for pynini.
    NEMO_PUNCT = pynini.union(
        *map(pynini.escape, string.punctuation)).optimize()
    NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize()

    # Any string: zero or more valid UTF-8 characters.
    NEMO_SIGMA = pynini.closure(NEMO_CHAR)

    # Deletes a (possibly empty) run of whitespace.
    delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE))
    insert_space = pynutil.insert(" ")
    # Rewrites a run of one or more whitespace characters to a single space.
    delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")

    # Mapping loaded from data/suppletive.tsv.
    suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
    # _v = pynini.union("a", "e", "i", "o", "u")
    _c = pynini.union("b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n",