Exemplo n.º 1
0
Arquivo: roman.py Projeto: NVIDIA/NeMo
    def __init__(self, deterministic: bool = True):
        """Build the verbalizer FST for tagged roman-numeral tokens.

        Four tagged layouts are accepted. In each one the field names and the
        surrounding quotes are deleted; the ``integer`` value is kept as is for
        the cardinal layouts and passed through ``OrdinalFst().suffix`` for the
        ordinal layouts.

        Args:
            deterministic: forwarded unchanged to the parent constructor.
        """
        super().__init__(name="roman",
                         kind="verbalize",
                         deterministic=deterministic)
        ordinal_suffix = OrdinalFst().suffix

        any_value = pynini.closure(NEMO_NOT_QUOTE)
        ordinal_value = pynini.compose(any_value, ordinal_suffix)

        # Small pieces shared by several of the layouts below.
        key_text = pynini.closure(NEMO_NOT_QUOTE, 1)
        close_quote = pynutil.delete("\"")
        open_integer = pynutil.delete("integer: \"")
        space = pynini.accep(" ")

        # key_cardinal: "<key>" integer: "<value>"  ->  <key> <value>
        keyed_cardinal = (pynutil.delete("key_cardinal: \"") + key_text +
                          close_quote + space + open_integer + any_value +
                          close_quote).optimize()

        # default_cardinal: "default" integer: "<value>"  ->  <value>
        default_cardinal = (
            pynutil.delete("default_cardinal: \"default\" integer: \"") +
            any_value + close_quote).optimize()

        # default_ordinal: "default" integer: "<value>"  ->  ordinal form
        default_ordinal = (
            pynutil.delete("default_ordinal: \"default\" integer: \"") +
            ordinal_value + close_quote).optimize()

        # key_the_ordinal: "<key>" integer: "<value>"
        #   ->  <key> [the ]<ordinal form>   ("the " is optionally inserted)
        optional_the = pynini.closure(pynutil.insert("the "), 0, 1)
        keyed_ordinal = (pynutil.delete("key_the_ordinal: \"") + key_text +
                         close_quote + space + open_integer + optional_the +
                         ordinal_value + close_quote).optimize()

        graph = (keyed_cardinal | default_cardinal | default_ordinal
                 | keyed_ordinal)
        self.fst = self.delete_tokens(graph).optimize()
Exemplo n.º 2
0
    def __init__(self, tn_decimal, deterministic: bool = False):
        """Build the decimal classifier from inverted TN decimal graphs.

        Args:
            tn_decimal: object providing the ``graph_fractional``,
                ``integer_part`` and ``optional_quantity`` FSTs; each of them
                is inverted here.
            deterministic: forwarded unchanged to the parent constructor.
        """
        super().__init__(name="decimal",
                         kind="classify",
                         deterministic=deterministic)

        def quoted(opening, body):
            """Insert ``opening`` before ``body`` and a closing quote after it."""
            return pynutil.insert(opening) + body + pynutil.insert("\"")

        # An optional leading "минус" becomes the field  negative: "true"
        sign = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("минус", "\"true\"") +
            delete_extra_space, 0, 1)

        fractional_words = pynini.invert(
            tn_decimal.graph_fractional).optimize()
        integer_words = pynini.invert(tn_decimal.integer_part).optimize()
        quantity_words = pynini.invert(
            tn_decimal.optional_quantity).optimize()

        fractional_field = quoted("fractional_part: \"", fractional_words)
        integer_field = quoted("integer_part: \"", integer_words)
        quantity_field = quoted("quantity: \"", quantity_words)
        # The quantity, when present, is separated by one accepted space.
        trailing_quantity = pynini.closure(
            pynini.accep(NEMO_SPACE) + quantity_field, 0, 1)

        self.final_graph_wo_sign = (integer_field + pynini.accep(NEMO_SPACE) +
                                    fractional_field + trailing_quantity)

        tagged = self.add_tokens(sign + self.final_graph_wo_sign)
        self.fst = tagged.optimize()
Exemplo n.º 3
0
def load_lexicon(source, symbol_table):
  '''
  Build a single FST that is the union of all non-blank lines of source.

  Each line is split into symbols: either a multi-character "<...>" tag or a
  single character, optionally written as an "upper:lower" pair. All symbols
  are compiled with symbol_table as their token type.
  '''
  def empty_fst():
    # A fresh FST carrying the shared symbol table on both sides.
    fst = pynini.Fst()
    fst.set_input_symbols(symbol_table)
    fst.set_output_symbols(symbol_table)
    return fst

  def symbol(text):
    return pynini.accep(text, token_type=symbol_table)

  lexicon = empty_fst()
  # longest match, prefer complex over simple symbols
  pair_pattern = re.compile("(<[^>]*>|.)(?::(<[^>]*>|.))?", re.U)
  for raw_line in source:
    entry_text = raw_line.strip()
    if not entry_text:
      continue
    # Start each entry as a one-state FST whose start state is final.
    entry = empty_fst()
    initial = entry.add_state()
    entry.set_start(initial)
    entry.set_final(initial)
    for upper, lower in pair_pattern.findall(entry_text):
      if lower:
        # NOTE(review): both operands of the cross product include the
        # entry built so far, and the result is then appended to that
        # entry again - confirm that repeating the prefix is intended.
        with_upper = pynini.concat(entry, symbol(upper))
        with_lower = pynini.concat(entry, symbol(lower))
        entry = pynini.concat(entry, pynini.cross(with_upper, with_lower))
      else:
        entry = pynini.concat(entry, symbol(upper))
    lexicon = pynini.union(lexicon, entry)
  return lexicon
Exemplo n.º 4
0
  def __construct_compound_filter(self):
    '''
    Construct the compound filter.

    The result is the concatenation of an optional leading marker, one of
    three stem patterns, and a closing "<base>" sequence. Category symbols,
    origin features and several markers are mapped to the empty string.
    '''
    with pynini.default_token_type(self.__syms.alphabet):

      kept_markers = pynini.string_map(
          ["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>", "<ge>", "<Ge>"]
          ).project("input")
      alphabet = pynini.union(
          self.__syms.characters,
          kept_markers,
          self.__syms.stem_types,
          pynini.cross(self.__syms.categories, ""),
          pynini.cross(self.__syms.origin_features, ""),
          pynini.cross("<NoPref>", "")
          )

      # At most one leading marker: "<Initial>" is removed, the other two
      # are passed through.
      leading_marker = pynini.union(
          pynini.cross("<Initial>", ""),
          pynini.accep("<NoHy>"),
          pynini.accep("<NoDef>")
          ).closure(0, 1)

      def stem_with_boundaries():
        # Any sequence of alphabet symbols and removed "<kompos>" markers.
        return pynini.union(alphabet, pynini.cross("<kompos>", "")).closure()

      # NOTE(review): the method form alphabet.closure() is kept exactly
      # where the original expression evaluated it; Pynini's method variant
      # appears to modify the FST in place, so the three patterns below must
      # stay in this order - confirm before reordering.

      # Pattern 1: plain stem whose category tag is removed.
      other_category = alphabet.closure() + pynini.cross(
          pynini.string_map(
              ["<ABK>", "<ADV>", "<CARD>", "<NE>", "<PRO>", "<V>", "<ORD>", "<OTHER>"]
              ).project("input"),
          "")

      # Pattern 2: "<VADJ>" is emitted, followed by a stem that contains at
      # least one "<kompos>" boundary and ends in a removed "<V>".
      verbal_adjective = (
          pynini.cross("", "<VADJ>")
          + stem_with_boundaries()
          + pynini.cross("<kompos>", "")
          + alphabet.closure()
          + pynini.cross("<V>", "")
          )

      # Pattern 3: stem with optional boundaries ending in "<ADJ>" or "<NN>".
      adjective_or_noun = stem_with_boundaries() + pynini.cross(
          pynini.string_map(["<ADJ>", "<NN>"]).project("input"), "")

      # "<base>" and the origin feature are removed; the inflection class
      # is kept.
      closing = (
          pynini.cross("<base>", "")
          + pynini.cross(self.__syms.origin_features, "")
          + self.__syms.inflection_classes
          )

      return (
          leading_marker
          + pynini.union(other_category, verbal_adjective, adjective_or_noun)
          + closing
          ).optimize()
Exemplo n.º 5
0
    def __init__(self, ordinal: GraphFst, cardinal: GraphFst):
        """Build the date classifier: "<day> <month> [<year>]" or a bare year.

        Args:
            ordinal: grammar whose ``graph`` attribute is used for the day.
            cardinal: stored on ``self.cardinal`` before the year graph is
                built.
        """
        super().__init__(name="date", kind="classify")

        # Set first: this assignment precedes the self._get_year_graph() call.
        self.cardinal = cardinal
        day_words = ordinal.graph
        YEAR_WEIGHT = 0.001
        weighted_year = pynutil.add_weight(self._get_year_graph(), YEAR_WEIGHT)
        month_words = _get_month_graph()

        close_quote = pynutil.insert("\"")
        month_field = pynutil.insert("month: \"") + month_words + close_quote
        day_field = (pynutil.insert("day: \"") +
                     pynutil.add_weight(day_words, -0.7) + close_quote)

        # Inside a full date the year weight added above is cancelled again.
        year_after_month = pynini.closure(
            delete_extra_space + pynutil.insert("year: \"") +
            pynutil.add_weight(weighted_year, -YEAR_WEIGHT) + close_quote,
            0,
            1,
        )
        day_month_year = (day_field + delete_extra_space + month_field +
                          year_after_month)

        # A bare year may carry an "er" or "ern" ending, which is kept.
        optional_er_n = pynini.closure(
            pynini.accep('er') + pynini.closure(pynini.accep('n'), 0, 1), 0, 1)
        bare_year = (pynutil.insert("year: \"") + weighted_year +
                     optional_er_n + close_quote)

        tagged = (day_month_year | bare_year) + pynutil.insert(
            " preserve_order: true")
        self.fst = self.add_tokens(tagged).optimize()
Exemplo n.º 6
0
    def __init__(self, cardinal, deterministic: bool = True):
        """Build the fraction classifier.

        Accepts an optional integer part followed by "numerator/denominator"
        (with or without spaces around the slash), either written directly or
        obtained by first mapping the input through ``data/number/fraction.tsv``.

        Args:
            cardinal: grammar whose ``graph`` attribute reads the numbers.
            deterministic: forwarded unchanged to the parent constructor.
        """
        super().__init__(name="fraction",
                         kind="classify",
                         deterministic=deterministic)
        number = cardinal.graph
        close_quote = pynutil.insert("\"")

        integer_field = pynutil.insert("integer_part: \"") + number + close_quote

        # The slash closes the numerator field and separates the two fields.
        slash = pynini.cross("/", "\" ") | pynini.cross(" / ", "\" ")
        numerator_field = pynutil.insert("numerator: \"") + number + slash

        # An ordinal ending after the denominator is dropped.
        lower_endings = ["rd", "th", "st", "nd"]
        all_endings = lower_endings + [e.upper() for e in lower_endings]
        drop_ending = pynini.closure(
            pynini.cross(pynini.union(*all_endings), ""), 0, 1)
        denominator_field = (pynutil.insert("denominator: \"") + number +
                             drop_ending + close_quote)

        fraction = numerator_field + denominator_field

        written_out = pynini.closure(integer_field + pynini.accep(" "), 0,
                                     1) + fraction
        # Table-driven variant: the space after the integer part may be
        # missing in the input, in which case it is inserted.
        via_table = pynini.closure(
            integer_field + (pynini.accep(" ") | pynutil.insert(" ")), 0,
            1) + pynini.compose(
                pynini.string_file(get_abs_path("data/number/fraction.tsv")),
                fraction)

        self.graph = written_out | via_table
        self.fst = self.add_tokens(self.graph).optimize()
Exemplo n.º 7
0
def get_year_graph(cardinal: GraphFst) -> 'pynini.FstLike':
    """
    Returns year verbalizations as fst

    The union of four patterns over four-digit input:
        1x yy: two two-digit numbers with an optional inserted "hundert"
        20 yy: two two-digit numbers
        1x 00: a two-digit number followed by an inserted "hundert"
        20yy / 21yy: the regular cardinal reading

    Args:
        cardinal: cardinal GraphFst
    """
    # Two-digit reading, applied after delete_leading_zero.
    two_digits = delete_leading_zero @ cardinal.two_digit_non_zero
    hundert = pynutil.insert("hundert")
    first_pair_1x = (pynini.accep("1") + NEMO_DIGIT) @ two_digits

    pairs_1x = (first_pair_1x + insert_space +
                pynini.closure(hundert + insert_space, 0, 1) + two_digits)
    # for 20**
    pairs_20 = (pynini.accep("20") @ two_digits) + insert_space + two_digits
    # for 1*00
    round_hundred = (first_pair_1x + insert_space + pynutil.delete("00") +
                     hundert)
    as_cardinal = (pynini.union("21", "20") + NEMO_DIGIT**2) @ cardinal.graph

    return pairs_1x | pairs_20 | round_hundred | as_cardinal
Exemplo n.º 8
0
    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        """Build the ordinal classifier for digit strings ending in st/nd/rd/th.

        The ending (lower- or upper-case) is deleted and the remaining digits
        are composed with ``cardinal.graph``.

        Args:
            cardinal: grammar whose ``graph`` attribute reads the digits.
            deterministic: forwarded unchanged to the parent constructor.
        """
        super().__init__(name="ordinal",
                         kind="classify",
                         deterministic=deterministic)

        cardinal_graph = cardinal.graph
        digits_and_commas = pynini.closure(NEMO_DIGIT | pynini.accep(","))
        not_one = NEMO_DIGIT - "1"
        not_one_two_three = NEMO_DIGIT - "1" - "2" - "3"

        def ends_in(last_digit, ending):
            """Number whose last digit is ``last_digit`` and whose
            second-to-last digit, if any, is not "1"; ``ending`` is deleted."""
            return (pynini.closure(digits_and_commas + not_one, 0, 1) +
                    pynini.accep(last_digit) +
                    pynutil.delete(pynini.union(ending, ending.upper())))

        st_format = ends_in("1", "st")
        nd_format = ends_in("2", "nd")
        rd_format = ends_in("3", "rd")

        # "th" follows a single digit other than 1/2/3, any number whose
        # second-to-last digit is 1, or a number ending in <not 1><not 1/2/3>.
        th_body = pynini.closure(
            not_one_two_three
            | (digits_and_commas + "1" + NEMO_DIGIT)
            | (digits_and_commas + not_one + not_one_two_three),
            1,
        )
        th_format = th_body + pynutil.delete(pynini.union("th", "TH"))

        self.graph = (st_format | nd_format | rd_format
                      | th_format) @ cardinal_graph
        tagged = pynutil.insert("integer: \"") + self.graph + pynutil.insert(
            "\"")
        self.fst = self.add_tokens(tagged).optimize()
Exemplo n.º 9
0
  def __construct_insert_zu(self):
    '''
    Inserts "zu" into infinitives with separable prefixes.

    Input without the pattern below is passed through unchanged; where a
    "<Pref_Stems>" part is followed by "<Base_Stems>" and a later "<^zz>"
    marker, "z u" is emitted right after "<Base_Stems>" and the marker is
    removed.
    '''
    with pynini.default_token_type(self.__syms.alphabet):

      marker_symbols = ["<n>", "<~n>", "<e>", "<d>", "<NoHy>", "<NoDef>", "<VADJ>", "<CB>", "<FB>", "<UL>", "<SS>", "<DEL-S>", "<Low#>", "<Up#>", "<Fix#>", "<^imp>", "<^UC>", "<^Ax>", "<^pl>", "<^Gen>", "<^Del>"]
      alphabet = pynini.union(
          self.__syms.characters,
          pynini.string_map(marker_symbols).project("input")
          ).optimize()

      # Any sequence of alphabet symbols and stem type markers.
      anything = pynini.union(
          alphabet,
          self.__syms.stem_types
          ).closure().optimize()

      # From deko.fst:
      # insert "zu" after verbal prefixes if followed by infinitive marker
      with_zu = (
          anything
          + pynini.accep("<Pref_Stems>")
          + alphabet.closure()
          + pynini.accep("<Base_Stems>")
          + pynini.cross("", "z u")
          + alphabet.closure()
          + pynini.cross("<^zz>", "")
          + alphabet.closure()
          )

      return pynini.union(anything, with_zu).optimize()
Exemplo n.º 10
0
    def __init__(self, deterministic: bool = True):
        """Build the classifier for "<username>@<domain>" strings.

        The username starts with a letter and continues with letters, digits
        or any symbol from the first column of ``data/electronic/symbols.tsv``.
        The domain starts with a letter, ends with a letter or digit and may
        contain letters, digits, "-" and "." in between. The "@" is replaced
        by a space separating the two tagged fields.

        Args:
            deterministic: forwarded unchanged to the parent constructor.

        Raises:
            ValueError: if a line of the symbol file does not consist of
                exactly two tab-separated columns.
        """
        super().__init__(name="electronic", kind="classify", deterministic=deterministic)

        accepted_symbols = []
        # Decode explicitly as UTF-8: without an ``encoding`` argument the
        # platform's default locale encoding is used, so the same data file
        # could be read differently (or fail to decode) on another machine.
        with open(get_abs_path("data/electronic/symbols.tsv"), 'r', encoding='utf-8') as f:
            for line in f:
                # Only the first column (the written symbol) is needed here.
                symbol, _ = line.split('\t')
                accepted_symbols.append(pynini.accep(symbol))

        username = (
            pynutil.insert("username: \"")
            + NEMO_ALPHA
            + pynini.closure(NEMO_ALPHA | NEMO_DIGIT | pynini.union(*accepted_symbols))
            + pynutil.insert("\"")
            + pynini.cross('@', ' ')
        )
        domain_graph = (
            NEMO_ALPHA
            + (pynini.closure(NEMO_ALPHA | NEMO_DIGIT | pynini.accep('-') | pynini.accep('.')))
            + (NEMO_ALPHA | NEMO_DIGIT)
        )
        domain_graph = pynutil.insert("domain: \"") + domain_graph + pynutil.insert("\"")
        graph = username + domain_graph

        final_graph = self.add_tokens(graph)
        self.fst = final_graph.optimize()
Exemplo n.º 11
0
    def __init__(self, deterministic: bool = True):
        """Build the punctuation classifier.

        Punctuation marks are every code point whose Unicode category starts
        with "P" (except "[" and "]") plus a fixed ASCII list, minus anything
        whose first column appears in ``data/whitelist/symbol.tsv``. A run of
        one or more marks is tagged as ``name: "..."``; angle-bracket markup
        of the form ``<x>``, ``<x/>`` or ``</x>`` is given priority over it.

        Args:
            deterministic: forwarded unchanged to the parent constructor.
        """
        super().__init__(name="punctuation",
                         kind="classify",
                         deterministic=deterministic)
        ascii_marks = "!#%&\'()*+,-./:;<=>?@^_`{|}~\""

        excluded_marks = ["[", "]"]
        unicode_marks = []
        for code_point in range(sys.maxunicode):
            char = chr(code_point)
            if category(char).startswith("P") and char not in excluded_marks:
                unicode_marks.append(char)

        # Only the first column of each whitelist row is compared.
        whitelisted = [
            row[0]
            for row in load_labels(get_abs_path("data/whitelist/symbol.tsv"))
        ]
        self.punct_marks = [
            mark for mark in unicode_marks + list(ascii_marks)
            if mark not in whitelisted
        ]

        punct_run = pynini.closure(pynini.union(*self.punct_marks), 1)

        # Markup body: non-space characters other than "<" and ">".
        markup_name = pynini.closure(
            NEMO_NOT_SPACE - pynini.union("<", ">"), 1)
        trailing_slash = markup_name + pynini.closure(pynini.accep("/"), 0, 1)
        leading_slash = pynini.accep("/") + pynini.closure(
            NEMO_NOT_SPACE - pynini.union("<", ">"), 1)
        emphasis = (pynini.accep("<") + (trailing_slash | leading_slash) +
                    pynini.accep(">"))

        self.graph = plurals._priority_union(emphasis, punct_run, NEMO_SIGMA)
        tagged = (pynutil.insert("name: \"") + self.graph +
                  pynutil.insert("\""))
        self.fst = tagged.optimize()
Exemplo n.º 12
0
    def __init__(self, tn_time: GraphFst, deterministic: bool = True):
        """Build the time classifier by inverting text-normalization graphs.

        Args:
            tn_time: TN time grammar providing ``graph_preserve_order``,
                ``minutes``, ``mins_to_h``, ``increment_hour_ordinal`` and
                ``increment_hour_cardinal``.
            deterministic: forwarded unchanged to the parent constructor.
        """
        super().__init__(name="time",
                         kind="classify",
                         deterministic=deterministic)

        close_quote = pynutil.insert("\"")
        open_hours = pynutil.insert("hours: \"")
        space = pynini.accep(NEMO_SPACE)

        # Tagger followed by verbalizer, then inverted as a whole; the result
        # is placed in the hours field.
        tagger = tn_time.graph_preserve_order
        verbalizer = TNTimeVerbalizer().graph
        tn_pipeline = pynini.compose(tagger, verbalizer).optimize()
        inverted_pipeline = pynini.invert(tn_pipeline).optimize()
        in_order = open_hours + inverted_pipeline + close_quote

        # "пятнадцать минут шестого" -> 17:15
        # Requires permutations for the correct verbalization
        minutes_then_next_hour = (
            pynutil.insert("minutes: \"") +
            pynini.invert(tn_time.minutes).optimize() + close_quote + space +
            open_hours +
            pynini.invert(tn_time.increment_hour_ordinal).optimize() +
            close_quote).optimize()

        # "без пятнадцати минут шесть" -> 17:45
        # Requires permutation for the correct verbalization
        minutes_to_hour = (
            pynini.cross("без ", "minutes: \"") +
            pynini.invert(tn_time.mins_to_h) + close_quote + space +
            open_hours +
            pynini.invert(tn_time.increment_hour_cardinal).optimize() +
            close_quote)

        reordered = minutes_then_next_hour | minutes_to_hour
        tagged = self.add_tokens(in_order | reordered)
        self.fst = tagged.optimize()
Exemplo n.º 13
0
  def __construct_suff_phon(self):
    '''
    Context-dependent deletion of "i" at a suffix boundary.

    An "i" is rewritten to the empty string when it directly follows either
    "i <Suff_Stems>" or a consonant plus "y <Suff_Stems>". The rewrite rule
    is followed by self.__tail.
    '''
    with pynini.default_token_type(self.__syms.alphabet):

      marker_symbols = ["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>", "<ge>", "<Ge>", "<no-ge>", "<Initial>", "<NoHy>", "<NoPref>", "<NoDef>", "<NN>", "<ADJ>"]
      sigma_star = pynini.union(
          self.__syms.characters,
          pynini.string_map(marker_symbols).project("input"),
          self.__syms.stem_types,
          ).closure()

      delete_i = pynini.cross("i", "")

      # Left context of the rewrite: ("i" | consonant "y") "<Suff_Stems>".
      # NOTE(review): project("input") is called on the shared consonants
      # object itself, as in the original - confirm it does not need a copy.
      stem_ending = (
          pynini.accep("i")
          | (self.__syms.consonants.project("input") + pynini.accep("y"))
          )
      left_context = stem_ending + pynini.accep("<Suff_Stems>")

      rule = pynini.cdrewrite(
          delete_i,
          left_context,
          "",
          sigma_star.project("input")
          )
      return (rule + self.__tail).optimize()
Exemplo n.º 14
0
 def __construct_compound_stems_nn(self, tmp):
     '''
     Default noun compounding stems.

     A non-empty character sequence, extended on the output side by
     "<+NN>", a gender symbol and a nominative singular or plural tag, is
     composed with tmp. The result is wrapped as
     "<Kompos_Stems>" ... "<NN>" "<kompos> <nativ>".
     '''
     with pynini.default_token_type(self.__syms.alphabet):

         def noun_analysis(case_number):
             # Emits "<+NN>" <gender> <case_number> without consuming input.
             return pynini.cross(
                 "",
                 pynini.accep("<+NN>") + self.__syms.gender +
                 pynini.accep(case_number))

         # NOTE(review): closure(1) is called on the shared characters
         # object itself, as in the original - confirm no copy is needed.
         surface_form = self.__syms.characters.closure(1)
         analysed = surface_form + pynini.union(
             noun_analysis("<Nom> <Sg>"), noun_analysis("<Nom> <Pl>"))
         kompos_stems = pynini.compose(analysed, tmp)

         wrapped = (pynini.cross("", "<Kompos_Stems>") + kompos_stems +
                    pynini.accep("<NN>") +
                    pynini.cross("", "<kompos> <nativ>"))
         return wrapped.optimize()
Exemplo n.º 15
0
    def __init__(self, cardinal: GraphFst, deterministic: bool):
        """Build the decimal classifier ("[-][integer].fraction [quantity]").

        Args:
            cardinal: grammar providing ``graph``, ``single_digits_graph`` and
                ``graph_hundred_component_at_least_one_none_zero_digit``.
            deterministic: when false, the fractional part may also be read
                through ``cardinal.graph`` and a "zero" integer part may
                alternatively be rendered as "oh".
        """
        super().__init__(name="decimal",
                         kind="classify",
                         deterministic=deterministic)

        cardinal_graph = cardinal.graph
        hundred_component = (
            cardinal.graph_hundred_component_at_least_one_none_zero_digit)

        # Reading used for the digits after the point.
        self.graph = cardinal.single_digits_graph.optimize()
        if not deterministic:
            self.graph = self.graph | cardinal_graph

        close_quote = pynutil.insert("\"")
        self.graph_fractional = pynutil.insert(
            "fractional_part: \"") + self.graph + close_quote
        self.graph_integer = pynutil.insert(
            "integer_part: \"") + cardinal_graph + close_quote

        # The integer part is optional; the "." itself is deleted.
        optional_integer = pynini.closure(
            self.graph_integer + pynutil.insert(" "), 0, 1)
        unsigned = (optional_integer + pynutil.delete(".") +
                    pynutil.insert(" ") + self.graph_fractional)

        self.final_graph_wo_negative = unsigned | get_quantity(
            unsigned, hundred_component)

        # reduce options for non_deterministic and allow either "oh" or "zero", but not combination
        if not deterministic:
            oh_and_zero = (
                (NEMO_SIGMA + "oh" + NEMO_SIGMA + "zero" + NEMO_SIGMA)
                | (NEMO_SIGMA + "zero" + NEMO_SIGMA + "oh" + NEMO_SIGMA))
            no_oh_zero = pynini.difference(NEMO_SIGMA, oh_and_zero).optimize()
            no_zero_oh = pynini.difference(
                NEMO_SIGMA, NEMO_SIGMA + pynini.accep("zero") + NEMO_SIGMA +
                pynini.accep("oh") + NEMO_SIGMA).optimize()

            # Add a variant in which a "zero" integer part is rendered "oh".
            zero_to_oh = pynini.cdrewrite(
                pynini.cross("integer_part: \"zero\"",
                             "integer_part: \"oh\""), NEMO_SIGMA, NEMO_SIGMA,
                NEMO_SIGMA)
            self.final_graph_wo_negative |= pynini.compose(
                self.final_graph_wo_negative, zero_to_oh)

            # Then drop every output that mixes the two spellings.
            for allowed_outputs in (no_oh_zero, no_zero_oh):
                self.final_graph_wo_negative = pynini.compose(
                    self.final_graph_wo_negative, allowed_outputs).optimize()

        # A leading "-" becomes the field  negative: "true"
        sign = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0,
            1)
        tagged = self.add_tokens(sign + self.final_graph_wo_negative)
        self.fst = tagged.optimize()
Exemplo n.º 16
0
    def get_serial_graph(self):
        """
        Finite state transducer for classifying serial numbers.
            The serial is a combination of digits, letters and dashes, e.g.:
            c325b -> tokens { cardinal { integer: "c three two five b" } }

        Digits are read through ``self.single_digits_graph`` (and also through
        ``self.graph`` when ``self.deterministic`` is false). Inputs without
        delimiters get a space inserted at every letter/digit boundary;
        inputs joined by "-" or "/" are handled by separate branches. The
        returned graph carries an added weight of 2.
        """
        digits = self.single_digits_graph

        if not self.deterministic:
            digits |= self.graph

        # add space between letter and digit
        after_letter = pynini.cdrewrite(pynutil.insert(" "), NEMO_ALPHA,
                                        NEMO_DIGIT, NEMO_SIGMA)
        after_digit = pynini.cdrewrite(pynutil.insert(" "), NEMO_DIGIT,
                                       NEMO_ALPHA, NEMO_SIGMA)
        spaced = pynini.compose(after_letter, after_digit)

        # make sure at least one digit and letter is present
        no_spaces = pynini.closure(NEMO_NOT_SPACE)
        letter_before_digit = (no_spaces + NEMO_ALPHA + no_spaces +
                               NEMO_DIGIT + no_spaces)
        digit_before_letter = (no_spaces + NEMO_DIGIT + no_spaces +
                               NEMO_ALPHA + no_spaces)
        spaced = pynini.compose(letter_before_digit | digit_before_letter,
                                spaced)

        space = pynini.accep(" ")
        letters = pynini.closure(NEMO_ALPHA, 1)

        # Spaced form that begins with letters ...
        letters_first = (
            pynini.closure(letters + space, 1) + digits + pynini.closure(
                space + pynini.closure(NEMO_ALPHA) +
                pynini.closure(space + digits, 0, 1)))
        # ... or with digits.
        digits_first = (
            digits + space + letters + pynini.closure(
                space + digits +
                pynini.closure(space + pynini.closure(NEMO_ALPHA), 0, 1)))

        serial = pynini.compose(spaced, letters_first)
        serial |= pynini.compose(spaced, digits_first)

        # serial graph with delimiter
        delimiter = pynini.accep("-") | pynini.accep("/")
        letters_then_number = letters + delimiter + digits
        numbers_then_letters = pynini.closure(digits + delimiter, 1) + letters
        continuation = pynini.closure(delimiter + (letters | digits))
        continuation |= pynini.closure(delimiter + digits +
                                       pynutil.insert(" ") + letters)

        serial |= letters_then_number + continuation
        serial |= numbers_then_letters + continuation
        # numbers only with 2+ delimiters
        serial |= (digits + delimiter + digits + delimiter + digits +
                   pynini.closure(delimiter + digits))
        return pynutil.add_weight(serial, 2)
Exemplo n.º 17
0
    def __init__(self, number_names: dict, alternative_formats: dict, deterministic: bool = False):
        """Build the cardinal classifier.

        Args:
            number_names: passed to ``self.get_cardinal_numbers``.
            alternative_formats: passed to ``self.get_cardinal_numbers``.
            deterministic: forwarded unchanged to the parent constructor.
        """
        super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

        self.cardinal_numbers_default = self.get_cardinal_numbers(number_names, alternative_formats, mode="all")
        self.cardinal_numbers_nominative = self.get_cardinal_numbers(
            number_names, alternative_formats, mode="nominative"
        )

        open_integer = pynutil.insert("integer: \"")
        close_quote = pynutil.insert("\"")

        # A leading "-" becomes the field  negative: "true"
        self.optional_graph_negative = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, 0, 1
        )

        self.cardinal_numbers_with_optional_negative = (
            self.optional_graph_negative + open_integer + self.cardinal_numbers_default + close_quote
        )

        # "03" -> remove leading zeros and verbalize
        strip_zeros = pynini.closure(pynini.cross("0", ""))
        self.cardinal_numbers_with_leading_zeros = (strip_zeros + self.cardinal_numbers_default).optimize()

        # "123" -> "один два три"
        one_digit = pynini.compose(NEMO_DIGIT, self.cardinal_numbers_nominative)
        self.single_digits_graph = one_digit + pynini.closure(insert_space + one_digit)

        # Optional quantity word; an existing space is slightly preferred
        # over an inserted one.
        quantity_words = pynini.string_file(get_abs_path("data/numbers/quantity.tsv")).optimize()
        quantity_field = pynutil.insert("quantity: \"") + quantity_words + close_quote
        optional_quantity = pynini.closure(
            (pynutil.add_weight(pynini.accep(NEMO_SPACE), -0.1) | insert_space) + quantity_field, 0, 1
        )

        serial_graph = self.get_serial_graph()

        regular = (
            self.optional_graph_negative
            + open_integer
            + self.cardinal_numbers_with_leading_zeros
            + close_quote
            + optional_quantity
        ).optimize()

        # Regular reading is favoured; digit-by-digit and serial readings
        # are heavily penalised.
        final_graph = pynutil.add_weight(regular, -0.1)
        digit_by_digit = (
            open_integer + pynutil.add_weight(self.single_digits_graph | serial_graph, 10) + close_quote
        )
        final_graph |= digit_by_digit
        self.final_graph = final_graph

        # to cover cases "2-х" -> "двух" (this is not covered by ordinal endings)
        without_ending = pynini.compose(NEMO_DIGIT ** (1, ...) + pynini.cross('-х', ''), final_graph)
        final_graph |= pynini.compose(
            without_ending,
            NEMO_SIGMA + pynini.accep("х\"") + NEMO_SIGMA,
        )
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Exemplo n.º 18
0
 def testFilledExporter(self):
   """Export two FSTs and check that both can be read back."""
   contents = (('FST1', '1234'), ('FST2', '4321'))
   exporter = export.Exporter(self._filename)
   for name, text in contents:
     exporter[name] = pynini.accep(text)
   exporter.close()
   stored_fsts = _read_fst_map(self._filename)
   self.assertLen(stored_fsts, 2)
   for name, _ in contents:
     self.assertTrue(stored_fsts[name])
Exemplo n.º 19
0
 def __construct_verbal_pref_stems(self):
   '''
   Verbal prefix stems.

   Restricts self.__pref_stems to entries of the form: initial features,
   "<Pref_Stems>", anything, "<V>", anything.
   '''
   with pynini.default_token_type(self.__syms.alphabet):
     verb_marker = pynini.accep("<V>", token_type=self.__syms.alphabet)
     verbal_shape = (
         self.__syms.initial_features.closure()
         + pynini.accep("<Pref_Stems>")
         + self.__sigma_star
         + verb_marker
         + self.__sigma_star
         )
     return pynini.compose(self.__pref_stems, verbal_shape).optimize()
Exemplo n.º 20
0
  def __construct_umlautung(self):
    '''
    Map "a", "o" and "u" onto "ä", "ö" and "ü", respectively, if the umlaut marker "<UL>" is present.

    The returned transducer reads a prefix over the local alphabet, then
    optionally one umlauting site, then self.__tail. An umlauting site is:
    consonants, the vowel to be changed, further consonants, an optional
    "el"/"er", "<Suff_Stems>" and the marker "<UL>", which is removed.
    '''
    with pynini.default_token_type(self.__syms.alphabet):

      # Everything that may precede the umlauting site ("<Ge>" is listed twice).
      alphabet = pynini.union(
          self.__syms.characters,
          pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>", "<ge>", "<Ge>", "<no-ge>", "<Ge>", "<Initial>", "<NoHy>", "<NoPref>", "<NoDef>"]).project("input"),
          self.__syms.stem_types,
          self.__syms.categories,
          ).closure()

      return pynini.concat(
          pynini.concat(
            alphabet,
            pynini.concat(
              # NOTE(review): this operand is the same object on which
              # .closure() is called further down, before this concat runs.
              # If the method form of closure() works in place, what is
              # concatenated here is already the closure - confirm intended.
              self.__syms.consonants,
              pynini.concat(
                # The vowel change itself: a->ä, o->ö, u->ü, or the
                # two-letter cases "aa"->"ä" and "au"->"äu".
                pynini.union(
                  pynini.union(
                    pynini.cross("a", "ä"),
                    pynini.cross("o", "ö"),
                    pynini.cross("u", "ü")
                    ),
                  pynini.concat(
                    pynini.cross("a", "ä"),
                    pynini.union(
                      pynini.cross("a", ""),
                      pynini.accep("u")
                      )
                    )
                  ),
                pynini.concat(
                  # NOTE(review): called on the shared symbol object itself,
                  # not on a copy - presumably this also affects later users
                  # of self.__syms.consonants; verify.
                  self.__syms.consonants.closure(),
                  pynini.concat(
                    # Optional "el" / "er" before the suffix boundary.
                    pynini.concat(
                      pynini.accep("e"),
                      pynini.string_map(["l", "r"]).project("input")
                      ).closure(0, 1),
                    # The boundary is kept, the umlaut marker is removed.
                    pynini.concat(
                      pynini.accep("<Suff_Stems>"),
                      pynini.cross("<UL>", "")
                      )
                    )
                  )
                )
              # The whole umlauting site is optional.
              ).closure(0, 1)
            ),
          self.__tail
          ).optimize()
Exemplo n.º 21
0
Arquivo: word.py Projeto: NVIDIA/NeMo
    def __init__(self, deterministic: bool = True):
        """Build the fallback classifier for plain words.

        A word is one or more non-space characters other than digits and the
        symbols "$", "€", "₩", "£", "¥", "#", "%". Bracketed phone sequences
        take priority over that reading and are tagged as a single name.

        Args:
            deterministic: when false, one optional space is also allowed
                directly inside each bracket of a phone sequence.
        """
        super().__init__(name="word", kind="classify", deterministic=deterministic)

        excluded = (pynini.union("$", "€", "₩", "£", "¥", "#", "%") | NEMO_DIGIT).optimize()
        word = pynini.closure(pynini.difference(NEMO_NOT_SPACE, excluded), 1)

        # leave phones of format [HH AH0 L OW1] untouched
        phone = pynini.closure(NEMO_ALPHA, 1) + pynini.closure(NEMO_DIGIT)
        phone_sequence = pynini.closure(phone + pynini.accep(" ")) + phone
        open_bracket = pynini.accep(pynini.escape("["))
        close_bracket = pynini.accep(pynini.escape("]"))

        if deterministic:
            phoneme = open_bracket + phone_sequence + close_bracket
        else:
            optional_space = pynini.closure(pynini.accep(" "), 0, 1)
            phoneme = open_bracket + optional_space + phone_sequence + optional_space + close_bracket

        self.graph = plurals._priority_union(convert_space(phoneme), word, NEMO_SIGMA)
        self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()
Exemplo n.º 22
0
    def __init__(
        self,
        itn_cardinal_tagger: GraphFst,
        tn_date_tagger: GraphFst,
        tn_date_verbalizer: GraphFst,
        deterministic: bool = True,
    ):
        """Build the date classifier from inverted text-normalization graphs.

        The inverted TN date verbalizer produces tagged fields, which are then
        rewritten into a written date and wrapped as ``name: "..."``.

        Args:
            itn_cardinal_tagger: provides ``graph_no_exception`` for day and
                numeric month.
            tn_date_tagger: provides ``month_abbr`` and ``year``.
            tn_date_verbalizer: provides ``graph``, which is inverted here.
            deterministic: forwarded unchanged to the parent constructor.
        """
        super().__init__(name="date", kind="classify", deterministic=deterministic)

        def field_value(opening):
            """Delete ``opening`` and the closing quote, keep the value."""
            return pynutil.delete(opening) + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")

        # Pads a single digit to two digits with a leading "0".
        two_digits = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT)
        drop_spaces = pynini.closure(NEMO_SIGMA | pynutil.delete(" ", weight=0.0001))
        tagger = tn_date_verbalizer.graph.invert().optimize()

        day = field_value("day: \"") @ itn_cardinal_tagger.graph_no_exception

        numeric_month = pynutil.delete("month: \"") + itn_cardinal_tagger.graph_no_exception + pynutil.delete("\"")
        named_month = pynutil.delete("month: \"") + tn_date_tagger.month_abbr.invert() + pynutil.delete("\"")

        year_to_digits = (tn_date_tagger.year @ drop_spaces).invert().optimize()
        year = field_value("year: \"") @ year_to_digits

        dot = pynutil.insert(".")

        # day. month as string (year)
        with_named_month = (
            pynini.closure(day + dot + pynini.accep(" "), 0, 1)
            + named_month
            + pynini.closure(pynini.accep(" ") + year, 0, 1)
        )

        # day. month as number (year)
        with_numeric_month = (
            (day @ two_digits)
            + dot
            + pynutil.delete(" ")
            + (numeric_month @ two_digits)
            + dot
            + pynini.closure(pynutil.delete(" ") + year, 0, 1)
        )

        # The last alternative is a year on its own.
        verbalizer = with_named_month | with_numeric_month | year

        written_date = tagger @ verbalizer

        tagged = pynutil.insert("name: \"") + convert_space(written_date) + pynutil.insert("\"")
        self.fst = tagged.optimize()
Exemplo n.º 23
0
    def __init__(self, deterministic: bool = True):
        """Build the "electronic" classifier for e-mail addresses, domains and URLs.

        Three alternatives are accepted: ``username@domain``, a name followed by
        one of the common domains listed in ``data/electronic/domain.tsv``, and a
        protocol prefix ("http://", "https://", "www.") followed by a domain.

        Args:
            deterministic: passed through to the base-class constructor.
        """
        super().__init__(name="electronic",
                         kind="classify",
                         deterministic=deterministic)

        def get_input_symbols(path):
            """Return one acceptor per line, built from the first of two tab-separated columns."""
            acceptors = []
            with open(path, 'r', encoding='utf-8') as handle:
                for row in handle:
                    # Exactly two columns are expected; anything else raises ValueError.
                    symbol, _ = row.split('\t')
                    acceptors.append(pynini.accep(symbol))
            return acceptors

        symbols_path = get_abs_path("data/electronic/symbols.tsv")
        domains_path = get_abs_path("data/electronic/domain.tsv")

        symbol_acceptors = get_input_symbols(symbols_path)
        common_domain_acceptors = get_input_symbols(domains_path)

        # A name starts with a letter and continues with letters, digits or listed symbols.
        name_chars = NEMO_ALPHA | NEMO_DIGIT | pynini.union(*symbol_acceptors)
        name = NEMO_ALPHA + pynini.closure(name_chars)
        graph_symbols = pynini.string_file(symbols_path).optimize()

        open_domain = pynutil.insert("domain: \"")
        close_field = pynutil.insert("\"")

        # The "@" separating user and domain is rewritten to a space.
        username = pynutil.insert("username: \"") + name + close_field + pynini.cross('@', ' ')

        domain_graph = open_domain + (name + pynini.accep('.') + name) + close_field
        domain_common_graph = open_domain + name + pynini.union(*common_domain_acceptors) + close_field

        protocol_start = pynini.union(pynini.accep("https://"), pynini.accep("http://"))
        protocol_end = pynini.accep("www.")
        # Each protocol character is a letter or a (slightly preferred) symbol
        # verbalization, and is followed by an inserted space.
        weighted_symbol = pynutil.add_weight(graph_symbols | pynini.cross(":", "colon"), -0.1)
        protocol_symbols = pynini.closure((NEMO_ALPHA | weighted_symbol) + pynutil.insert(" "))
        protocol_shapes = pynini.union(protocol_start, protocol_end, protocol_start + protocol_end)
        protocol = pynutil.insert("protocol: \"") + (protocol_shapes @ protocol_symbols) + pynutil.insert("\"")

        graph = pynini.union(
            username + domain_graph,
            domain_common_graph,
            protocol + pynutil.insert(" ") + domain_graph,
        )

        self.fst = self.add_tokens(graph).optimize()
Exemplo n.º 24
0
    def __init__(self, cardinal: GraphFst, deterministic=False):
        """Build the "ordinal" classifier on top of a cardinal graph.

        The input is a run of digits and dots followed by either one of the
        endings "ter", "tes", "tem", "te", "ten" or a final "."; the marker is
        deleted and what remains is composed with ``cardinal.graph``.

        Args:
            cardinal: provides ``graph``, applied to the stripped digit string.
            deterministic: passed through to the base-class constructor.
        """
        super().__init__(name="ordinal",
                         kind="classify",
                         deterministic=deterministic)

        endings = ["ter", "tes", "tem", "te", "ten"]

        digits_and_dots = pynini.closure(NEMO_DIGIT | pynini.accep("."))
        # The written endings carry a small weight; a trailing "." carries none.
        weighted_endings = pynutil.add_weight(pynini.union(*endings), weight=0.0001)
        strip_marker = pynutil.delete(weighted_endings | pynini.accep("."))

        self.graph = ((digits_and_dots + strip_marker) @ cardinal.graph).optimize()

        tagged = pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")
        self.fst = self.add_tokens(tagged).optimize()
Exemplo n.º 25
0
  def __init__(self, category: Category, *features_and_values: str) -> None:
    """Sets up an acceptor for the defined category.

    Features of the category that are given a setting are pinned to that
    value; features that are not mentioned accept any of their values.

    Args:
      category: a Category.
      *features_and_values: list of strings, consisting of specific
        feature-value settings such as "num=sg", "gen=mas", etc.

    Raises:
       Error: No features_and_values provided.
       Error: Invalid name.
       Error: Invalid value for a known feature.
       ValueError: A setting does not split on "=" into exactly a name and a
         value.
    """
    if not features_and_values:
      raise Error("No features_and_values provided")
    self._category = category
    self._feature_settings = {}
    valid_names = frozenset(f.name for f in category.features)
    for feature_and_value in features_and_values:
      (f, v) = feature_and_value.split("=")
      if f not in valid_names:
        raise Error(f"Invalid name: {f}")
      self._feature_settings[f] = v
    acceptors = []
    # Walk the category's own feature order so the acceptors are concatenated
    # in the order the category declares them, not the order of the arguments.
    for feature in category.features:
      if feature.name in self._feature_settings:
        value = self._feature_settings[feature.name]
        if value not in feature.values:
          # The name was already validated above; here it is the value that
          # is wrong, so say so.
          raise Error(f"Invalid value for {feature.name}: {value}")
        acceptors.append(pynini.accep(f"[{feature.name}={value}]"))
      else:
        # If not specified, allows all values.
        acceptors.append(feature.acceptor)
    self._acceptor = _concatstar(acceptors)
Exemplo n.º 26
0
  def __init__(self,
               name: str,
               *values: str,
               default: Optional[str] = None) -> None:
    """Sets up an acceptor for the defined features.

    The acceptor accepts anything in [name=v] for v in values.

    Args:
      name: a string, the name for this feature (e.g. "gender")
      *values: one or more values (e.g. "masc", "fem", "neu")
      default: if set, is the default value for this feature, which is added to
        values if not already there.

    Raises:
      Error: No values provided.
    """
    if not values:
      # The exception used to be constructed and then discarded, so this
      # check never fired; it has to be raised to have any effect.
      raise Error("No values provided to Feature object")
    self._name = name
    self._values = list(values)
    self._default = default
    self._default_acceptor = None
    if self._default:
      if self._default not in self._values:
        self._values.append(self._default)
      self._default_acceptor = pynini.accep(f"[{name}={self._default}]")
      self._default_acceptor.optimize()
    self._acceptor = pynini.union(*(f"[{self._name}={v}]"
                                    for v in self._values))
    self._acceptor.optimize()
Exemplo n.º 27
0
    def __init__(self, deterministic: bool = True):
        """Build the "fraction" verbalizer.

        Reads ``integer``, ``numerator`` and ``denominator`` fields and writes
        them out as words, with the denominator passed through the ordinal
        suffix graph (or "four" rewritten to "quarter").

        Args:
            deterministic: passed through to the base-class constructor. When
                False, the inserted "and " after the integer part is optional.
        """
        super().__init__(name="fraction", kind="verbalize", deterministic=deterministic)
        suffix = OrdinalFst().suffix

        any_value = pynini.closure(NEMO_NOT_QUOTE)
        end_of_field = pynutil.delete("\" ")

        integer_field = pynutil.delete("integer: \"") + any_value + end_of_field
        numerator = pynutil.delete("numerator: \"") + any_value + end_of_field
        numerator_one = pynutil.delete("numerator: \"") + pynini.accep("one") + end_of_field

        # `@` binds tighter than `|`: either the ordinal form or the "quarter" rewrite.
        denominator_words = (any_value @ suffix) | pynini.cross('four', 'quarter')
        denominator = pynutil.delete("denominator: \"") + denominator_words

        if deterministic:
            conjunction = pynutil.insert("and ")
        else:
            conjunction = pynini.closure(pynutil.insert("and "), 0, 1)

        optional_integer = pynini.closure(integer_field + insert_space + conjunction, 0, 1)

        half_fields = "numerator: \"one\" denominator: \"two\""
        a_half = pynini.cross(half_fields, "a half")
        one_half = pynini.cross(half_fields, "one half")
        over_one_or_halves = pynini.union(
            pynini.cross("denominator: \"one\"", "over one"),
            pynini.cross("denominator: \"two\"", "halves"),
        )

        # The general case pluralizes the denominator with "s" and costs more
        # than the numerator-"one" case, which leaves it singular.
        plural_fraction = pynutil.add_weight(
            numerator + insert_space + denominator + pynutil.insert("s") + pynutil.delete("\""), 0.001
        )
        singular_fraction = pynutil.add_weight(
            numerator_one + insert_space + denominator + pynutil.delete("\""), 0.0001
        )

        # NOTE(review): `+` binds tighter than `|`, so the optional integer
        # prefix is attached to the "a half" branch only; the other branches
        # take no integer field. Confirm that this grouping is intended.
        graph = pynini.union(
            optional_integer + a_half,
            singular_fraction,
            plural_fraction,
            one_half,
            (numerator | numerator_one) + insert_space + over_one_or_halves,
        )

        self.graph = graph
        self.fst = self.delete_tokens(self.graph).optimize()
Exemplo n.º 28
0
    def __init__(self, tn_cardinal_tagger: GraphFst, deterministic: bool = True):
        """Build the "telephone" classifier by inverting a written-to-spoken graph.

        The written layout described here is ``(0ddd) dddd-dddd``, where each
        ``d`` is a digit 1-9 mapped through ``two_digit_non_zero`` and the
        leading "0" is mapped to "null". The whole graph is then inverted.

        Args:
            tn_cardinal_tagger: provides ``two_digit_non_zero``, applied to
                each single digit 1-9.
            deterministic: passed through to the base-class constructor.
        """
        super().__init__(name="telephone", kind="classify", deterministic=deterministic)

        component_gap = pynini.accep(" ")  # between components
        nonzero_digits = pynini.union(*(str(d) for d in range(1, 10)))
        digit = nonzero_digits @ tn_cardinal_tagger.two_digit_non_zero
        zero = pynini.cross("0", "null")
        spaced_digit = digit + insert_space

        # "(0" followed by three digits and ")".
        area_code = (
            pynutil.delete("(")
            + zero
            + insert_space
            + pynini.closure(spaced_digit, 2, 2)
            + digit
            + pynutil.delete(")")
        )
        # Four digits; used on both sides of the dash.
        first_group = pynini.closure(spaced_digit, 3, 3) + digit
        second_group = pynini.closure(spaced_digit, 3, 3) + digit

        number_part = area_code + component_gap + first_group + pynutil.delete("-") + insert_space + second_group

        graph = convert_space(pynini.invert(number_part))
        self.fst = (pynutil.insert("name: \"") + graph + pynutil.insert("\"")).optimize()
Exemplo n.º 29
0
    def __init__(self,
                 whitelist: 'pynini.FstLike',
                 deterministic: bool = True):
        """Build the "abbreviation" classifier for runs of upper-case letters.

        Sequences of at least two upper-case letters, with or without a dot
        after each letter, are accepted either unchanged or with a space
        inserted between the letters. Inputs that the whitelist graph also
        accepts are excluded.

        Args:
            whitelist: object whose ``graph`` attribute gives the inputs to
                exclude.
            deterministic: passed through to the base-class constructor.
        """
        super().__init__(name="abbreviation",
                         kind="classify",
                         deterministic=deterministic)

        dot = pynini.accep(".")
        dotted_letter = NEMO_UPPER + dot

        candidates = pynini.union(
            # A.B.C. -> A. B. C.
            dotted_letter + pynini.closure(insert_space + NEMO_UPPER + dot, 1),
            # A.B.C. -> A.B.C.
            dotted_letter + pynini.closure(NEMO_UPPER + dot, 1),
            # ABC -> ABC
            NEMO_UPPER + pynini.closure(NEMO_UPPER, 1),
            # ABC -> A B C
            NEMO_UPPER + pynini.closure(insert_space + NEMO_UPPER, 1),
        )

        # exclude words that are included in the whitelist
        candidate_inputs = pynini.project(candidates, "input")
        whitelisted_inputs = pynini.project(whitelist.graph, "input")
        allowed_inputs = pynini.difference(candidate_inputs, whitelisted_inputs)
        filtered = pynini.compose(allowed_inputs, candidates)

        tagged = pynutil.insert("value: \"") + filtered.optimize() + pynutil.insert("\"")
        self.fst = self.add_tokens(tagged).optimize()
Exemplo n.º 30
0
    def __init__(self, number_names: dict, alternative_formats: dict, deterministic=False):
        """Build the "ordinal" classifier from pre-built number-name graphs.

        Args:
            number_names: mapping read at key ``'ordinal_number_names'``.
            alternative_formats: mapping read at keys
                ``'one_thousand_alternative'`` and ``'separators'``.
            deterministic: passed through to the base-class constructor.

        Attributes set:
            ordinal_numbers: ``separators`` composed with the ordinal graph.
            ordinal_numbers_with_leading_zeros: the same graph preceded by any
                number of deleted "0" characters.
            fst: the final tagged classifier.
        """
        super().__init__(name="ordinal", kind="classify", deterministic=deterministic)

        separators = alternative_formats['separators']
        one_thousand_alternative = alternative_formats['one_thousand_alternative']

        ordinal = number_names['ordinal_number_names']
        ordinal |= ordinal @ one_thousand_alternative

        self.ordinal_numbers = separators @ ordinal

        # to handle cases like 2-ая
        endings = pynini.string_file(get_abs_path("data/numbers/ordinal_endings.tsv"))
        not_dash = pynini.closure(pynini.difference(NEMO_SIGMA, "-"))
        # Removes a trailing "-<ending>" at end of string.
        del_ending = pynini.cdrewrite(pynini.cross("-" + not_dash, ""), "", "[EOS]", NEMO_SIGMA)

        # Number, dash, then anything dash-free; the result must end in a
        # listed ending, after which the dash and ending are removed.
        number_dash_tail = ((separators @ ordinal).optimize() + pynini.accep("-") + not_dash).optimize()
        ends_with_known_ending = (NEMO_SIGMA + endings).optimize()
        ordinal_numbers_marked = (number_dash_tail @ ends_with_known_ending @ del_ending).optimize()

        # "03" -> remove leading zeros and verbalize
        strip_zeros = pynini.closure(pynini.cross("0", ""))
        self.ordinal_numbers_with_leading_zeros = (strip_zeros + self.ordinal_numbers).optimize()

        combined = (self.ordinal_numbers | ordinal_numbers_marked).optimize()
        tagged = pynutil.insert("integer: \"") + combined + pynutil.insert("\"")
        self.fst = self.add_tokens(tagged).optimize()