示例#1
0
def get_alternative_formats():
    """
    Build the helper FSTs describing alternative written forms of numbers.

    Returns:
        dict with two optimized FSTs:
            'one_thousand_alternative': rewrite rule anchored at the beginning
                of the string, built from data/numbers/cardinals_alternatives.tsv
            'separators': weighted union of the 'dot_thousands', 'no_delimiter'
                and 'space_thousands' grammars stored in
                data/utils/universal_thousands_punct.far
    """
    alternatives_path = get_abs_path('data/numbers/cardinals_alternatives.tsv')
    # Every row is a (default, alternative) pair; the second whitespace-separated
    # token of the alternative is mapped onto the complete alternative.
    rewrite_pairs = [(alternative.split()[1], alternative)
                     for _default, alternative in load_labels(alternatives_path)]
    one_thousand_alternative = pynini.cdrewrite(
        pynini.string_map(rewrite_pairs), "[BOS]", "", NEMO_SIGMA)

    # Adapted from
    # https://github.com/google/TextNormalizationCoveringGrammars/blob/master/src/universal/thousands_punct.grm
    # Specifies common ways of delimiting thousands in digit strings.
    thousands_far = pynini.Far(
        get_abs_path('data/utils/universal_thousands_punct.far'))
    dot_delimited = pynutil.add_weight(thousands_far['dot_thousands'], 0.1)
    undelimited = pynutil.add_weight(thousands_far['no_delimiter'], -0.1)
    space_delimited = pynutil.add_weight(thousands_far['space_thousands'], 0.1)
    separators = dot_delimited | undelimited | space_delimited

    return {
        'one_thousand_alternative': one_thousand_alternative.optimize(),
        'separators': separators.optimize(),
    }
示例#2
0
    def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
        """
        Build the whitelist tagger graph.

        Args:
            input_case: kept for interface compatibility; the first column of
                every whitelist row is lower-cased regardless of its value
            deterministic: forwarded unchanged to the parent constructor
            input_file: optional path to a whitelist file that is used instead
                of the default data/whitelist.tsv
        """
        super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

        def _get_whitelist_graph(input_case, file):
            # The previous if/else on input_case had two identical branches, so
            # the test was dead code: the first column is always lower-cased.
            # NOTE(review): confirm whether a non-"lower_cased" mode was meant
            # to keep the original casing.
            whitelist = [[x[0].lower()] + x[1:] for x in load_labels(file)]
            return pynini.string_map(whitelist)

        # A user-supplied file replaces the default whitelist, so only the file
        # that is actually used gets loaded and compiled.
        whitelist_file = input_file if input_file else get_abs_path("data/whitelist.tsv")
        graph = _get_whitelist_graph(input_case, whitelist_file)

        units_graph = _get_whitelist_graph(input_case, file=get_abs_path("data/measurements.tsv"))
        # Restrict the unit inputs to strings of two or more characters, or to a
        # single character that is not a Russian letter: a lone letter such as
        # `м` is not replaced, while `°` and `%` are.
        units_graph = pynini.compose((NEMO_CHAR ** (2, ...) | pynini.difference(NEMO_CHAR, RU_ALPHA)), units_graph)
        graph |= units_graph.optimize()
        # One or more TO_CYRILLIC matches, with a space inserted between them.
        graph |= TO_CYRILLIC + pynini.closure(pynutil.insert(" ") + TO_CYRILLIC)

        self.final_graph = convert_space(graph)
        self.fst = (pynutil.insert("name: \"") + self.final_graph + pynutil.insert("\"")).optimize()
示例#3
0
    def __init__(self, cardinal: GraphFst, deterministic: bool = False):
        """
        Build the decimal tagger graph.

        Args:
            cardinal: cardinal grammar; its `cardinal_numbers_default`,
                `cardinal_numbers_with_leading_zeros` and
                `optional_graph_negative` attributes are used
            deterministic: forwarded unchanged to the parent constructor
        """
        super().__init__(name="decimal",
                         kind="classify",
                         deterministic=deterministic)

        leading_zero_cardinals = cardinal.cardinal_numbers_with_leading_zeros

        # "," is removed and replaced by the delimiter entry, optionally
        # followed by an inserted " и" that carries a weight of 0.5.
        delimiter_labels = prepare_labels_for_insertion(
            get_abs_path("data/numbers/decimal_delimiter.tsv"))
        optional_and = pynini.closure(
            pynutil.add_weight(pynutil.insert(" и"), 0.5), 0, 1)
        delimiter = (pynini.cross(",", "") +
                     delimiter_labels['@@decimal_delimiter@@'] +
                     optional_and).optimize()

        ending_labels = prepare_labels_for_insertion(
            get_abs_path("data/numbers/decimal_endings.tsv"))

        self.integer_part = cardinal.cardinal_numbers_default + delimiter
        graph_integer = (pynutil.insert("integer_part: \"") +
                         self.integer_part + pynutil.insert("\""))

        # One branch per fractional length (1 to 4 digits); each branch appends
        # the ending stored under the matching key '10' ... '10000'.
        digits = NEMO_DIGIT
        fractional = digits @ leading_zero_cardinals + ending_labels['10']
        for ending_key in ('100', '1000', '10000'):
            digits = digits + NEMO_DIGIT
            fractional |= digits @ leading_zero_cardinals + ending_labels[
                ending_key]

        self.optional_quantity = pynini.string_file(
            get_abs_path("data/numbers/quantity.tsv")).optimize()

        self.graph_fractional = fractional
        tagged_fractional = (pynutil.insert("fractional_part: \"") +
                             fractional + pynutil.insert("\""))

        # The quantity is separated either by an existing space (weight -0.1)
        # or by an inserted one.
        quantity_separator = (
            pynutil.add_weight(pynini.accep(NEMO_SPACE), -0.1) | insert_space)
        tagged_quantity = pynini.closure(
            quantity_separator + pynutil.insert("quantity: \"") +
            self.optional_quantity + pynutil.insert("\""),
            0,
            1,
        )

        self.final_graph = (cardinal.optional_graph_negative + graph_integer +
                            insert_space + tagged_fractional +
                            tagged_quantity)

        self.final_graph = self.add_tokens(self.final_graph)
        self.fst = self.final_graph.optimize()
示例#4
0
文件: ordinal.py 项目: NVIDIA/NeMo
    def __init__(self, number_names: dict, alternative_formats: dict, deterministic=False):
        """
        Build the ordinal tagger graph.

        Args:
            number_names: dict of number-name FSTs; the 'ordinal_number_names'
                entry is read
            alternative_formats: dict providing the 'one_thousand_alternative'
                and 'separators' FSTs
            deterministic: forwarded unchanged to the parent constructor
        """
        super().__init__(name="ordinal", kind="classify", deterministic=deterministic)

        thousand_rewrite = alternative_formats['one_thousand_alternative']
        separators = alternative_formats['separators']

        # Ordinal names, extended with their "one thousand" alternative forms.
        ordinal = number_names['ordinal_number_names']
        ordinal |= ordinal @ thousand_rewrite
        ordinal_numbers = separators @ ordinal

        # to handle cases like 2-ая
        ending_lexicon = pynini.string_file(get_abs_path("data/numbers/ordinal_endings.tsv"))
        dashless = pynini.closure(pynini.difference(NEMO_SIGMA, "-"))
        # Removes a trailing "-<suffix>" at the end of the string.
        strip_suffix = pynini.cdrewrite(pynini.cross("-" + dashless, ""), "", "[EOS]", NEMO_SIGMA)
        number_dash_suffix = ((separators @ ordinal).optimize() + pynini.accep("-") + dashless).optimize()
        ends_with_listed_ending = (NEMO_SIGMA + ending_lexicon).optimize()
        ordinal_numbers_marked = (number_dash_suffix @ ends_with_listed_ending @ strip_suffix).optimize()

        self.ordinal_numbers = ordinal_numbers
        # "03" -> remove leading zeros and verbalize
        strip_zeros = pynini.closure(pynini.cross("0", ""))
        self.ordinal_numbers_with_leading_zeros = (strip_zeros + ordinal_numbers).optimize()

        tagged = (ordinal_numbers | ordinal_numbers_marked).optimize()
        tagged = pynutil.insert("integer: \"") + tagged + pynutil.insert("\"")
        self.fst = self.add_tokens(tagged).optimize()
示例#5
0
 def _get_whitelist_graph(input_case, file="data/whitelist.tsv"):
     """
     Compile a whitelist file into a string-map FST.

     Args:
         input_case: kept for interface compatibility; the first column is
             lower-cased regardless of its value
         file: whitelist path, resolved through get_abs_path

     Returns:
         pynini.string_map built from the rows with a lower-cased first column
     """
     # The previous if/else on input_case had two identical branches, so the
     # test was dead code.
     # NOTE(review): confirm whether a non-"lower_cased" mode was meant to keep
     # the original casing.
     whitelist = [[x[0].lower()] + x[1:] for x in load_labels(get_abs_path(file))]
     return pynini.string_map(whitelist)
示例#6
0
    def __init__(self, number_names: dict, alternative_formats: dict, deterministic: bool = False):
        """
        Build the cardinal tagger graph and the helper graphs exposed as attributes.

        Args:
            number_names: passed through to get_cardinal_numbers
            alternative_formats: passed through to get_cardinal_numbers
            deterministic: forwarded unchanged to the parent constructor
        """
        super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

        self.cardinal_numbers_default = self.get_cardinal_numbers(number_names, alternative_formats, mode="all")
        self.cardinal_numbers_nominative = self.get_cardinal_numbers(
            number_names, alternative_formats, mode="nominative"
        )

        # A leading "-" becomes `negative: "true" `.
        minus_sign = pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space
        self.optional_graph_negative = pynini.closure(minus_sign, 0, 1)

        self.cardinal_numbers_with_optional_negative = (
            self.optional_graph_negative
            + pynutil.insert("integer: \"")
            + self.cardinal_numbers_default
            + pynutil.insert("\"")
        )

        # "03" -> remove leading zeros and verbalize
        strip_zeros = pynini.closure(pynini.cross("0", ""))
        self.cardinal_numbers_with_leading_zeros = (strip_zeros + self.cardinal_numbers_default).optimize()

        # "123" -> "один два три"
        one_digit = pynini.compose(NEMO_DIGIT, self.cardinal_numbers_nominative)
        self.single_digits_graph = one_digit + pynini.closure(insert_space + one_digit)

        quantity_words = pynini.string_file(get_abs_path("data/numbers/quantity.tsv")).optimize()
        quantity_field = pynutil.insert("quantity: \"") + quantity_words + pynutil.insert("\"")
        # The quantity follows an existing space (weight -0.1) or an inserted one.
        quantity_separator = pynutil.add_weight(pynini.accep(NEMO_SPACE), -0.1) | insert_space
        optional_quantity = pynini.closure(quantity_separator + quantity_field, 0, 1)

        serial_graph = self.get_serial_graph()

        final_graph = (
            self.optional_graph_negative
            + pynutil.insert("integer: \"")
            + self.cardinal_numbers_with_leading_zeros
            + pynutil.insert("\"")
            + optional_quantity
        ).optimize()
        final_graph = pynutil.add_weight(final_graph, -0.1)

        # Digit-by-digit and serial readings are added with a weight of 10.
        spelled_or_serial = pynutil.add_weight(self.single_digits_graph | serial_graph, 10)
        final_graph |= pynutil.insert("integer: \"") + spelled_or_serial + pynutil.insert("\"")
        self.final_graph = final_graph

        # to cover cases "2-х" -> "двух" (this is not covered by ordinal endings)
        digits_with_suffix = NEMO_DIGIT ** (1, ...) + pynini.cross('-х', '')
        output_ends_in_kh = NEMO_SIGMA + pynini.accep("х\"") + NEMO_SIGMA
        final_graph |= pynini.compose(pynini.compose(digits_with_suffix, final_graph), output_ends_in_kh)

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
示例#7
0
 def __init__(self, deterministic: bool = True):
     """
     Build the whitelist graph from the inverted data/whitelist.tsv mapping.

     Args:
         deterministic: forwarded unchanged to the parent constructor
     """
     super().__init__(name="whitelist",
                      kind="classify",
                      deterministic=deterministic)
     # invert() swaps the input and output sides of the mapping read from the file.
     whitelist_path = get_abs_path("data/whitelist.tsv")
     inverted_whitelist = pynini.string_file(whitelist_path).invert()
     open_field = pynutil.insert("name: \"")
     close_field = pynutil.insert("\"")
     tagged = open_field + convert_space(inverted_whitelist) + close_field
     self.fst = tagged.optimize()
示例#8
0
def get_alternative_formats():
    """
    Build the helper FSTs describing alternative written forms of numbers.

    Returns:
        dict with the keys 'one_thousand_alternative' (rewrite rule anchored at
        the beginning of the string) and 'separators' (weighted union of the
        thousands-delimiter grammars); neither FST is optimized here.
    """
    rows = load_labels(
        get_abs_path('data/numbers/cardinals_alternatives.tsv'))
    # Each row is a (default, alternative) pair; the second whitespace-separated
    # token of the alternative is rewritten to the complete alternative.
    rewrite_pairs = []
    for _default, alternative in rows:
        second_token = alternative.split()[1]
        rewrite_pairs.append((second_token, alternative))
    rewrite_map = pynini.string_map(rewrite_pairs)

    thousands_far = pynini.Far(
        get_abs_path('data/utils/universal_thousands_punct.far'))
    weighted_delimiters = [
        pynutil.add_weight(thousands_far['dot_thousands'], 0.1),
        pynutil.add_weight(thousands_far['no_delimiter'], -0.1),
        pynutil.add_weight(thousands_far['space_thousands'], 0.1),
    ]

    return {
        'one_thousand_alternative':
        pynini.cdrewrite(rewrite_map, "[BOS]", "", NEMO_SIGMA),
        'separators':
        weighted_delimiters[0] | weighted_delimiters[1]
        | weighted_delimiters[2],
    }
示例#9
0
def get_number_names():
    """
    Compile the number-name transducers.

    Based on: 1) Gorman, K., and Sproat, R. 2016. Minimally supervised number normalization.
    Transactions of the Association for Computational Linguistics 4: 507-519.
    and 2) Ng, A. H., Gorman, K., and Sproat, R. 2017.
    Minimally supervised written-to-spoken text normalization. In ASRU, pages 665-670.

    Returns:
        dict with the keys 'ordinal_number_names', 'cardinal_number_names' and
        'nominative_up_to_thousand_names', each holding an optimized FST.
    """

    def _lexicon(relative_path):
        # Loads one lexicon file and optimizes it.
        return pynini.string_file(get_abs_path(relative_path)).optimize()

    def _space_separated(word):
        # One or more `word` matches separated by single spaces.
        return (pynini.closure(word + pynini.accep(" ")) + word).optimize()

    arithmetic_far = pynini.Far(get_abs_path('data/utils/util_arithmetic.far'),
                                mode='r')
    delta_star = arithmetic_far['DELTA_STAR']
    restricted = arithmetic_far['IARITHMETIC_RESTRICTED']
    g = pynini.Fst.read(get_abs_path('data/utils/g.fst'))
    # The restricted arithmetic FST is applied three times on top of g.
    factorization = (restricted @ g).optimize()
    factorization = (restricted @ factorization).optimize()
    factorization = (restricted @ factorization).optimize()
    fg = (delta_star @ factorization).optimize()
    assert rewrite.top_rewrite("230", fg) == "(+ 200 30 +)"

    # Compiles lexicon transducers (L).
    cardinal_name_nominative = _lexicon(
        "data/numbers/1_cardinals_nominative_именительный.tsv")
    other_case_names = [
        _lexicon(path) for path in (
            "data/numbers/2_cardinals_genitive_родительный.tsv",
            "data/numbers/3_cardinals_dative_датильный.tsv",
            "data/numbers/4_cardinals_accusative_винительный.tsv",
            "data/numbers/5_cardinals_instrumental_творительный.tsv",
            "data/numbers/6_cardinals_prepositional_предложный.tsv",
        )
    ]

    # Within one phrase every word comes from the same case lexicon.
    cardinal_l = _space_separated(cardinal_name_nominative)
    for case_names in other_case_names:
        cardinal_l |= _space_separated(case_names)

    # Numbers up to 1000 in nominative case (to use, for example, with telephone)
    nominative_up_to_thousand_name_l = _space_separated(
        pynini.string_file(
            get_abs_path("data/numbers/cardinals_nominative_case.tsv")))

    # Convert e.g. "(* 5 1000 *)" back to  "5000" so complex ordinals will be formed correctly,
    #  e.g. "пятитысячный" will eventually be formed. (If we didn't do this, the incorrect phrase
    # "пять тысячный" would be formed).
    # We do this for all thousands from "(*2 1000 *)" —> "2000" to "(*20 1000 *)" —> "20000".
    # We do not go higher, in order to prevent the WFST graph becoming even larger.
    complex_numbers = pynini.cross("(* 2 1000 *)", "2000")
    for thousands in range(3, 21):
        complex_numbers |= pynini.cross(f"(* {thousands} 1000 *)",
                                        f"{thousands}000")

    trailing_markup = pynini.closure(pynini.union(" ", ")", "(", "+", "*"))
    complex_numbers = (NEMO_SIGMA + pynutil.add_weight(complex_numbers, -1) +
                       trailing_markup)
    fg_ordinal = pynutil.add_weight(pynini.compose(fg, complex_numbers),
                                    -1) | fg

    # Nominative cardinal words followed by a final ordinal word.
    ordinal_name = pynini.string_file(
        get_abs_path("data/numbers/ordinals.tsv"))
    ordinal_l = (pynini.closure(cardinal_name_nominative + pynini.accep(" ")) +
                 ordinal_name).optimize()

    # Composes L with the leaf transducer (P), then composes that with FG.
    leaves = arithmetic_far['LEAVES']
    return {
        'ordinal_number_names': (fg_ordinal @ (leaves @ ordinal_l)).optimize(),
        'cardinal_number_names': (fg @ (leaves @ cardinal_l)).optimize(),
        'nominative_up_to_thousand_names':
        (fg @ (leaves @ nominative_up_to_thousand_name_l)).optimize(),
    }
示例#10
0
        ("Ё́", "Е'"),
        ("И́", "И'"),
        ("О́", "О'"),
        ("У́", "У'"),
        ("Ы́", "Ы'"),
        ("Э́", "Э'"),
        ("Ю́", "Ю'"),
        ("Я́", "Я'"),
        ("а́", "а'"),
        ("е́", "е'"),
        ("ё́", "е'"),
        ("и́", "и'"),
        ("о́", "о'"),
        ("у́", "у'"),
        ("ы́", "ы'"),
        ("э́", "э'"),
        ("ю́", "ю'"),
        ("я́", "я'"),
        ("ё", "е"),
        ("Ё", "Е"),
    ]

    REWRITE_STRESSED = pynini.closure(pynini.string_map(RU_STRESSED_MAP).optimize() | RU_ALPHA).optimize()
    TO_LATIN = pynini.string_file(get_abs_path("data/cyrillic_to_latin.tsv"))
    RU_ALPHA_OR_SPACE = pynini.union(RU_ALPHA, NEMO_SPACE, NEMO_NON_BREAKING_SPACE).optimize()

except (ModuleNotFoundError, ImportError):
    # Create placeholders
    RU_ALPHA = None
    LO_LATIN = None
示例#11
0
文件: date.py 项目: mousebaiker/NeMo
    def __init__(self, number_names: dict, deterministic: bool):
        """
        Build the date grammar as a tagger graph composed with a verbalizer graph.

        Args:
            number_names: dict of number-name FSTs; only the
                'ordinal_number_names' entry is read here
            deterministic: forwarded unchanged to the parent constructor
        """
        super().__init__(name="date",
                         kind="classify",
                         deterministic=deterministic)

        # Ru format: DD-MM-YYYY or DD-MM-YY
        month_abbr_to_names = pynini.string_file(
            get_abs_path("data/months/abbr_to_name.tsv")).optimize()

        # Field separators are rewritten to a space: "." with weight 1.09,
        # "/" and "-" with weight 1.1.
        delete_sep = pynutil.add_weight(pynini.cross(
            ".", " "), 1.09) | pynutil.add_weight(
                pynini.cross(pynini.union("/", "-"), " "), 1.1)

        numbers = number_names['ordinal_number_names']

        # A leading "0" is either dropped (weight -0.1) or read as "ноль " (weight 0.1).
        zero = (pynutil.add_weight(pynini.cross("0", ""),
                                   -0.1)) | (pynutil.add_weight(
                                       pynini.cross("0", "ноль "), 0.1))
        # "0" followed by exactly one digit.
        zero_digit = zero + pynini.compose(NEMO_DIGIT, numbers)
        # Day without a leading zero: a single digit, or "1"/"2"/"3" followed by any digit.
        digit_day = (pynini.union("1", "2", "3") + NEMO_DIGIT) | NEMO_DIGIT
        digit_day = pynini.compose(digit_day, numbers)
        day = (pynutil.insert("day: \"") + (zero_digit | digit_day) +
               pynutil.insert("\"")).optimize()

        # Month read as a number: "0" + digit, or "1" + digit.
        digit_month = zero_digit | pynini.compose(
            pynini.accep("1") + NEMO_DIGIT, numbers)
        month_number_to_abbr = pynini.string_file(
            get_abs_path("data/months/numbers.tsv")).optimize()
        # Accept the month number with an optional leading "0" (deleted) or a
        # leading "1" before it is looked up in the numbers table.
        month_number_to_abbr = (((
            (pynutil.add_weight(pynini.cross("0", ""), -0.1)
             | pynini.accep("1")) + NEMO_DIGIT) | NEMO_DIGIT).optimize()
                                @ month_number_to_abbr).optimize()

        # Month read as a name: number -> abbreviation -> name, or directly
        # abbreviation -> name with an extra weight of 0.1.
        month_name = ((month_number_to_abbr @ month_abbr_to_names) |
                      pynutil.add_weight(month_abbr_to_names, 0.1)).optimize()
        month = (pynutil.insert("month: \"") + (month_name | digit_month) +
                 pynutil.insert("\"")).optimize()
        # Year: four or two digits, or "0" followed by one digit.
        year = pynini.compose(((NEMO_DIGIT**4) | (NEMO_DIGIT**2)),
                              numbers).optimize()
        year |= zero_digit
        year_word_singular = ["год", "года", "году", "годом", "годе"]
        year_word_plural = [
            "годы", "годов", "годам", "годами", "годам", "годах"
        ]

        # "г." expands to any singular form, "гг." to any plural form.
        year_word = pynini.cross("г.", pynini.union(*year_word_singular))
        year_word |= pynini.cross("гг.", pynini.union(*year_word_plural))
        # The year word is preceded by an inserted space (weight -0.1) or an
        # existing one (weight 0.1).
        year_word = (pynutil.add_weight(insert_space, -0.1)
                     | pynutil.add_weight(pynini.accep(" "), 0.1)) + year_word

        year_optional = pynutil.insert("year: \"") + year + pynini.closure(
            year_word, 0, 1) + pynutil.insert("\"")
        year_optional = pynini.closure(delete_sep + year_optional, 0,
                                       1).optimize()
        # A stand-alone year is only accepted together with its year word.
        year_only = pynutil.insert(
            "year: \"") + year + year_word + pynutil.insert("\"")

        tagger_graph = (day + delete_sep + month + year_optional) | year_only

        # Verbalizer
        # NOTE: day, month, year and year_optional are rebound below; from here
        # on they strip the field names and quotes that the tagger inserted.
        day = (pynutil.delete("day:") + delete_space + pynutil.delete("\"") +
               pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\""))
        month = (pynutil.delete("month:") + delete_space +
                 pynutil.delete("\"") + pynini.closure(NEMO_NOT_QUOTE, 1) +
                 pynutil.delete("\""))
        year = (pynutil.delete("year:") + delete_space + pynutil.delete("\"") +
                pynini.closure(NEMO_NOT_QUOTE, 1) + delete_space +
                pynutil.delete("\""))
        year_optional = pynini.closure(delete_extra_space + year, 0, 1)
        graph_dmy = day + delete_extra_space + month + year_optional
        verbalizer_graph = (graph_dmy | year) + delete_space

        # Digits in, spoken text out: the tagger output is fed straight into the verbalizer.
        self.final_graph = pynini.compose(tagger_graph,
                                          verbalizer_graph).optimize()
        # The whole verbalized date is emitted inside a single `day` field.
        self.fst = pynutil.insert(
            "day: \"") + self.final_graph + pynutil.insert("\"")
        self.fst = self.add_tokens(self.fst).optimize()
示例#12
0
文件: electronic.py 项目: manneh/NeMo
    def __init__(self, deterministic: bool = True):
        """
        Build the electronic (e-mail address) grammar as a tagger graph composed
        with a verbalizer graph.

        Args:
            deterministic: forwarded unchanged to the parent constructor
        """
        super().__init__(name="electronic",
                         kind="classify",
                         deterministic=deterministic)

        # tagger
        # The first column of symbols.tsv lists the extra characters allowed in a username.
        accepted_symbols = []
        # Explicit encoding: the same file is compiled with pynini.string_file
        # below, so it must not depend on the platform's default encoding.
        with open(get_abs_path("data/electronic/symbols.tsv"), 'r',
                  encoding="utf-8") as f:
            for line in f:
                symbol, _ = line.split('\t')
                accepted_symbols.append(pynini.accep(symbol))
        # Username: a letter followed by letters, digits or accepted symbols;
        # the "@" is rewritten to a space.
        username = (pynutil.insert("username: \"") + NEMO_ALPHA +
                    pynini.closure(NEMO_ALPHA | NEMO_DIGIT
                                   | pynini.union(*accepted_symbols)) +
                    pynutil.insert("\"") + pynini.cross('@', ' '))
        # Domain: starts with a letter and ends with a letter or digit; letters,
        # digits, "-" and "." are allowed in between.
        domain_graph = (
            NEMO_ALPHA +
            (pynini.closure(NEMO_ALPHA | NEMO_DIGIT | pynini.accep('-')
                            | pynini.accep('.'))) + (NEMO_ALPHA | NEMO_DIGIT))
        domain_graph = pynutil.insert(
            "domain: \"") + domain_graph + pynutil.insert("\"")
        tagger_graph = (username + domain_graph).optimize()

        # verbalizer
        graph_digit = pynini.string_file(
            get_abs_path(
                "data/numbers/digits_nominative_case.tsv")).optimize()
        graph_symbols = pynini.string_file(
            get_abs_path("data/electronic/symbols.tsv")).optimize()
        # Strip the `username: "` wrapper (restored from a corrupted line to
        # mirror the `domain:` handling below) and read the content character by
        # character, each followed by a space.
        user_name = (
            pynutil.delete("username:") + delete_space + pynutil.delete("\"") +
            (pynini.closure(
                pynutil.add_weight(graph_digit + insert_space, 1.09)
                | pynutil.add_weight(
                    pynini.closure(graph_symbols + pynutil.insert(" ")), 1.09)
                | pynutil.add_weight(NEMO_NOT_QUOTE + insert_space, 1.1))) +
            pynutil.delete("\""))

        # Fallback reading of the top-level domain: "." becomes "точка " and the
        # remaining characters are separated by spaces.
        domain_default = (pynini.closure(NEMO_NOT_QUOTE + insert_space) +
                          pynini.cross(".", "точка ") + NEMO_NOT_QUOTE +
                          pynini.closure(insert_space + NEMO_NOT_QUOTE))

        # Fallback reading of the server name, character by character.
        server_default = (pynini.closure(
            (graph_digit | NEMO_ALPHA) + insert_space, 1) +
                          pynini.closure(graph_symbols + insert_space) +
                          pynini.closure(
                              (graph_digit | NEMO_ALPHA) + insert_space, 1))
        # Entries listed in the data files carry a slightly smaller weight
        # (1.09) than the character-by-character fallbacks (1.1).
        server_common = pynini.string_file(
            get_abs_path("data/electronic/server_name.tsv")) + insert_space
        domain_common = pynini.cross(".", "точка ") + pynini.string_file(
            get_abs_path("data/electronic/domain.tsv"))
        domain = (pynutil.delete("domain:") + delete_space +
                  pynutil.delete("\"") +
                  (pynutil.add_weight(server_common, 1.09)
                   | pynutil.add_weight(server_default, 1.1)) +
                  (pynutil.add_weight(domain_common, 1.09)
                   | pynutil.add_weight(domain_default, 1.1)) + delete_space +
                  pynutil.delete("\""))

        graph = user_name + delete_space + pynutil.insert(
            "собака ") + delete_space + domain + delete_space
        # replace all latin letters with their Ru verbalization
        verbalizer_graph = (graph.optimize() @ (pynini.closure(
            TO_CYRILLIC | RU_ALPHA | pynini.accep(" ")))).optimize()
        verbalizer_graph = verbalizer_graph.optimize()

        self.final_graph = (tagger_graph @ verbalizer_graph).optimize()
        # The whole verbalized address is emitted inside a single `username` field.
        self.fst = self.add_tokens(
            pynutil.insert("username: \"") + self.final_graph +
            pynutil.insert("\"")).optimize()
示例#13
0
    def __init__(self,
                 cardinal: GraphFst,
                 decimal: GraphFst,
                 deterministic: bool = True):
        """
        Build the money grammar as a tagger graph composed with a verbalizer graph.

        Args:
            cardinal: cardinal grammar; its `cardinal_numbers_default` is used
            decimal: decimal grammar; its `final_graph` is used
            deterministic: forwarded unchanged to the parent constructor
        """
        super().__init__(name="money",
                         kind="classify",
                         deterministic=deterministic)
        cardinal_graph = cardinal.cardinal_numbers_default

        singular_units = pynini.string_file(
            get_abs_path("data/currency/currency_singular.tsv"))
        plural_units = pynini.string_file(
            get_abs_path("data/currency/currency_plural.tsv"))

        # adding weight to make sure the space is preserved for ITN
        drop_space = pynini.closure(
            pynutil.add_weight(pynini.cross(NEMO_SPACE, ""), -100), 0, 1)

        def _currency_field(units):
            # Optional space removal followed by ` currency: "<unit>"`.
            return (drop_space + pynutil.insert(" currency: \"") + units +
                    pynutil.insert("\""))

        def _integer_field(number):
            # `integer_part: "<number>"`.
            return (pynutil.insert("integer_part: \"") + number +
                    pynutil.insert("\""))

        singular_currency = _currency_field(singular_units)
        plural_currency = _currency_field(plural_units)

        # Exactly "1" takes the singular currency form; every other integer and
        # every decimal takes the plural form.
        exactly_one = pynini.compose(pynini.accep("1"),
                                     cardinal_graph).optimize()
        amount_decimal = decimal.final_graph + plural_currency
        amount_integer = (_integer_field((NEMO_SIGMA - "1") @ cardinal_graph) +
                          plural_currency)
        amount_integer |= _integer_field(exactly_one) + singular_currency

        tagger_graph = (amount_integer.optimize()
                        | amount_decimal.optimize()).optimize()

        # verbalizer
        quoted_value = (pynutil.delete("\"") +
                        pynini.closure(NEMO_NOT_QUOTE, 1) +
                        pynutil.delete("\""))
        integer_part = pynutil.delete("integer_part: ") + quoted_value

        unit = (pynutil.delete("currency: ") + pynutil.delete("\"") +
                pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\""))
        unit = pynini.accep(NEMO_SPACE) + unit

        verbalizer_graph_cardinal = (integer_part + unit).optimize()

        fractional_part = pynutil.delete("fractional_part: ") + quoted_value
        optional_quantity = pynini.closure(
            pynini.accep(NEMO_SPACE) + pynutil.delete("quantity: ") +
            quoted_value, 0, 1)

        # Decimal amounts arrive wrapped in `decimal { ... }` before the unit.
        verbalizer_graph_decimal = (pynutil.delete('decimal { ') +
                                    integer_part + pynini.accep(" ") +
                                    fractional_part + optional_quantity +
                                    pynutil.delete(" }") + unit)

        verbalizer_graph = (verbalizer_graph_cardinal
                            | verbalizer_graph_decimal).optimize()

        self.final_graph = (tagger_graph @ verbalizer_graph).optimize()
        # The whole verbalized amount is emitted inside a single `integer_part` field.
        wrapped = (pynutil.insert("integer_part: \"") + self.final_graph +
                   pynutil.insert("\""))
        self.fst = self.add_tokens(wrapped).optimize()
示例#14
0
文件: measure.py 项目: quuhua911/NeMo
    def __init__(self,
                 cardinal: GraphFst,
                 decimal: GraphFst,
                 deterministic: bool = True):
        """
        Build the measure grammar as a tagger graph composed with a verbalizer graph.

        Args:
            cardinal: cardinal grammar; its `cardinal_numbers_default`,
                `cardinal_numbers_nominative` and `optional_graph_negative`
                attributes are used
            decimal: decimal grammar; its `final_graph` is used
            deterministic: forwarded unchanged to the parent constructor
        """
        super().__init__(name="measure",
                         kind="classify",
                         deterministic=deterministic)

        # adding weight to make sure the space is preserved for ITN
        # NOTE: this local `delete_space` removes at most one regular or
        # non-breaking space and is used under this name for the rest of the method.
        delete_space = pynini.closure(
            pynutil.add_weight(
                pynutil.delete(
                    pynini.union(NEMO_SPACE, NEMO_NON_BREAKING_SPACE)), -1), 0,
            1)

        cardinal_graph = cardinal.cardinal_numbers_default
        cardinal_graph_nominative = cardinal.cardinal_numbers_nominative
        graph_unit = pynini.string_file(get_abs_path("data/measurements.tsv"))
        optional_graph_negative = cardinal.optional_graph_negative

        # Space inserted between units: non-breaking (weight -0.1) or regular (weight 0.1).
        space_for_units = (
            pynutil.add_weight(pynutil.insert(NEMO_NON_BREAKING_SPACE), -0.1)
            | pynutil.add_weight(pynutil.insert(NEMO_SPACE), 0.1)).optimize()
        # "/" before a unit is read as "в" or "за".
        slash_unit = (pynini.cross("/", "в")
                      | pynini.cross("/", "за")) + space_for_units + graph_unit

        # unit "/" unit
        unit_slash_unit = pynutil.add_weight(
            graph_unit + space_for_units + slash_unit, -0.1)
        default_units = pynutil.insert("units: \"") + (
            graph_unit | unit_slash_unit) + pynutil.insert("\"")
        slash_units = pynutil.insert(
            "units: \"") + slash_unit + pynutil.insert("\"")
        # Decimal number followed by units.
        subgraph_decimal = decimal.final_graph + (
            (delete_space + default_units) | slash_units)

        # Cardinal number (optionally negative) followed by units.
        cardinal_space = (
            pynutil.insert("cardinal { ") + optional_graph_negative +
            pynutil.insert("integer: \"") + cardinal_graph +
            ((delete_space + pynutil.insert("\"") + pynutil.insert(" } ") +
              default_units)
             | (pynutil.insert("\"") + pynutil.insert(" } ") + slash_units)))

        # Cardinal, an optional "-" (deleted), then one or more Russian letters
        # taken as the units.
        cardinal_optional_dash_alpha = (
            pynutil.insert("cardinal { integer: \"") + cardinal_graph +
            pynini.closure(pynini.cross('-', ''), 0, 1) +
            pynutil.insert("\" } units: \"") + pynini.closure(RU_ALPHA, 1) +
            pynutil.insert("\""))

        # Reverse order: Russian letters, an optional "-" (deleted), then a
        # nominative cardinal; `preserve_order: true` is appended.
        alpha_optional_dash_cardinal = (
            pynutil.insert("units: \"") + pynini.closure(RU_ALPHA, 1) +
            pynini.closure(pynini.cross('-', ''), 0, 1) +
            pynutil.insert("\"") + pynutil.insert(" cardinal { integer: \"") +
            cardinal_graph_nominative +
            pynutil.insert("\" } preserve_order: true"))

        # Decimal, a mandatory "-" (deleted), then Russian letters as units.
        decimal_dash_alpha = (decimal.final_graph + pynini.cross('-', '') +
                              pynutil.insert(" units: \"") +
                              pynini.closure(RU_ALPHA, 1) +
                              pynutil.insert("\""))

        # Reverse order: Russian letters, a mandatory "-" (deleted), then a decimal.
        alpha_dash_decimal = (pynutil.insert("units: \"") +
                              pynini.closure(RU_ALPHA, 1) +
                              pynini.cross('-', '') + pynutil.insert("\" ") +
                              decimal.final_graph +
                              pynutil.insert(" preserve_order: true"))

        self.tagger_graph_default = (subgraph_decimal
                                     | cardinal_space).optimize()

        tagger_graph = (self.tagger_graph_default
                        | cardinal_optional_dash_alpha
                        | alpha_optional_dash_cardinal
                        | decimal_dash_alpha
                        | alpha_dash_decimal).optimize()

        # verbalizer
        # Each graph below strips the field names and quotes that the tagger inserted.
        unit = pynutil.delete("units: \"") + pynini.closure(
            NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + delete_space

        # `negative: "true" ` is read as "минус ".
        optional_sign = pynini.closure(
            pynini.cross("negative: \"true\" ", "минус "), 0, 1)
        integer = pynutil.delete(" \"") + pynini.closure(
            NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
        integer_part = pynutil.delete("integer_part:") + integer
        fractional_part = pynutil.delete("fractional_part:") + integer
        optional_quantity_part = pynini.closure(
            pynini.accep(" ") + pynutil.delete("quantity: \"") +
            pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\""),
            0,
            1,
        )
        graph_decimal = optional_sign + integer_part + pynini.accep(
            " ") + fractional_part + optional_quantity_part

        graph_decimal = pynutil.delete(
            "decimal {"
        ) + delete_space + graph_decimal + delete_space + pynutil.delete("}")

        graph_cardinal = (pynutil.delete("cardinal {") + delete_space +
                          optional_sign + pynutil.delete("integer: \"") +
                          pynini.closure(NEMO_NOT_QUOTE, 1) +
                          pynutil.delete("\"") + delete_space +
                          pynutil.delete("}"))

        # Default order: number first, then the unit.
        verbalizer_graph = (graph_cardinal |
                            graph_decimal) + delete_space + insert_space + unit

        # SH adds "preserve_order: true" by default
        preserve_order = pynutil.delete(
            "preserve_order:") + delete_space + pynutil.delete(
                "true") + delete_space
        # Reverse order: unit first, then the number; the `preserve_order` flag
        # is optional and removed when present.
        verbalizer_graph |= (unit + insert_space +
                             (graph_cardinal | graph_decimal) + delete_space +
                             pynini.closure(preserve_order, 0, 1))
        self.verbalizer_graph = verbalizer_graph.optimize()

        final_graph = (tagger_graph @ verbalizer_graph).optimize()
        # The whole verbalized measure is emitted inside `cardinal { integer: "..." }`.
        self.fst = self.add_tokens(
            pynutil.insert("cardinal { integer: \"") + final_graph +
            pynutil.insert("\" }")).optimize()
示例#15
0
        ("Ы́", "Ы'"),
        ("Э́", "Э'"),
        ("Ю́", "Ю'"),
        ("Я́", "Я'"),
        ("а́", "а'"),
        ("е́", "е'"),
        ("ё́", "е'"),
        ("и́", "и'"),
        ("о́", "о'"),
        ("у́", "у'"),
        ("ы́", "ы'"),
        ("э́", "э'"),
        ("ю́", "ю'"),
        ("я́", "я'"),
        ("ё", "е"),
        ("Ё", "Е"),
    ]

    REWRITE_STRESSED = pynini.closure(
        pynini.string_map(RU_STRESSED_MAP).optimize() | RU_ALPHA).optimize()
    TO_CYRILLIC = pynini.string_file(
        get_abs_path("data/latin_to_cyrillic.tsv")).optimize()
    TO_LATIN = pynini.invert(TO_CYRILLIC).optimize()
    RU_ALPHA_OR_SPACE = pynini.union(RU_ALPHA, NEMO_SPACE,
                                     NEMO_NON_BREAKING_SPACE).optimize()

except (ModuleNotFoundError, ImportError):
    # Create placeholders
    RU_ALPHA = None
    LO_LATIN = None
示例#16
0
文件: time.py 项目: quuhua911/NeMo
    def __init__(self, number_names: dict, deterministic: bool = True):
        """Build the tagger grammar for clock times written as ``HH:MM``.

        Three alternative taggings are unioned into ``self.final_graph``:

        * hours followed by minutes, tagged with ``preserve_order: true``
          (or hours only when the input ends in ``:00``);
        * minutes paired with the *incremented* hour in ordinal form;
        * "без <minutes>" paired with the *incremented* hour in cardinal form.

        Args:
            number_names: mapping that must contain the key
                ``'cardinal_names_nominative'``; the value is concatenated
                into the grammar as an FST (presumably digits -> nominative
                cardinal words; confirm against the caller).
            deterministic: forwarded unchanged to the parent constructor.
        """
        super().__init__(name="time",
                         kind="classify",
                         deterministic=deterministic)

        increment_hour_ordinal = pynini.string_file(
            get_abs_path("data/time/increment_hour_ordinal.tsv"))
        increment_hour_cardinal = pynini.string_file(
            get_abs_path("data/time/increment_hour_cardinal.tsv"))
        convert_hour = pynini.string_file(
            get_abs_path("data/time/time_convert.tsv"))

        # Optionally drop one leading "0" before verbalizing the number.
        number = pynini.closure(pynini.cross("0", ""), 0,
                                1) + number_names['cardinal_names_nominative']

        # Accepted hour strings: the inputs of the ordinal increment table
        # plus the outputs of the hour conversion table.
        hour_options = pynini.project(increment_hour_ordinal, "input")
        hour_options = hour_options | pynini.project(convert_hour, "output")

        # The noun form after the hour depends on the number:
        # 01, 21 -> " час"; 02-04, 22, 23 -> " часа"; the rest -> " часов".
        hour_exception_ends_with_one = pynini.union(*["01", "21"])
        hour_exception_ends_rest = pynini.union(
            *["02", "03", "04", "22", "23"])
        hour_other = (pynini.difference(
            hour_options,
            pynini.union(hour_exception_ends_with_one,
                         hour_exception_ends_rest))).optimize()

        hour = hour_exception_ends_with_one @ number + pynutil.insert(" час")
        hour |= hour_exception_ends_rest @ number + pynutil.insert(" часа")
        hour |= hour_other @ number + pynutil.insert(" часов")

        optional_and = pynini.closure(pynutil.insert("и "), 0, 1)
        digits = pynini.union(*[str(x) for x in range(10)])
        mins_start = pynini.union(*"012345")
        mins_options = mins_start + digits

        # Minutes ending in 1 take " минута", except 11: like 12-14 below it
        # needs the plural " минут", so it is removed here and is picked up
        # by `mins_other`.
        mins_exception_ends_with_one = pynini.difference(
            mins_start + pynini.accep("1"), pynini.accep("11"))
        # Minutes ending in 2, 3 or 4 take " минуты", except 12, 13 and 14.
        mins_exception_ends_rest = pynini.difference(
            mins_start + pynini.union(*"234"),
            pynini.union(*["12", "13", "14"]))
        mins_other = pynini.difference(
            mins_options,
            pynini.union(mins_exception_ends_with_one,
                         mins_exception_ends_rest))

        minutes = mins_exception_ends_with_one @ number + pynutil.insert(
            " минута")
        minutes |= mins_exception_ends_rest @ number + pynutil.insert(
            " минуты")
        minutes |= mins_other @ number + pynutil.insert(" минут")
        self.minutes = minutes.optimize()

        # Hours then minutes, e.g. 17:15 -> "семнадцать часов и пятнадцать
        # минут" (the "и " is optional). The tag order must be kept by the
        # verbalizer, hence "preserve_order: true".
        hm = (pynutil.insert("hours: \"") + hour.optimize() +
              pynutil.insert("\"") +
              (pynini.cross(":", " ") + pynutil.insert("minutes: \"") +
               optional_and + minutes.optimize()) + pynutil.insert("\"") +
              pynutil.insert(" preserve_order: true"))
        # Whole hours: the trailing ":00" is deleted and only hours are tagged.
        h = pynutil.insert("hours: \"") + hour + pynutil.insert(
            "\"") + pynutil.delete(":00")
        self.graph_preserve_order = (hm | h).optimize()

        # Minutes of the next hour, e.g. 17:15 -> "пятнадцать минут шестого".
        # Requires permutations for the correct verbalization.
        self.increment_hour_ordinal = pynini.compose(
            hour_options, increment_hour_ordinal).optimize()
        m_next_h = (pynutil.insert("hours: \"") + self.increment_hour_ordinal +
                    pynutil.insert("\"") + pynini.cross(":", " ") +
                    pynutil.insert("minutes: \"") + minutes +
                    pynutil.insert("\""))

        # Minutes remaining to the next hour, e.g.
        # 17:45 -> "без пятнадцати минут шесть".
        # Requires permutations for the correct verbalization.
        self.mins_to_h = pynini.string_file(
            get_abs_path("data/time/minutes_to_hour.tsv")).optimize()
        self.increment_hour_cardinal = pynini.compose(
            hour_options, increment_hour_cardinal).optimize()
        m_to_h = (pynutil.insert("hours: \"") + self.increment_hour_cardinal +
                  pynutil.insert("\"") + pynini.cross(":", " ") +
                  pynutil.insert("minutes: \"без ") + self.mins_to_h +
                  pynutil.insert("\""))

        self.final_graph = m_next_h | self.graph_preserve_order | m_to_h
        self.fst = self.add_tokens(self.final_graph)
        self.fst = self.fst.optimize()