Exemplo n.º 1
0
def get_quantity(decimal: 'pynini.FstLike',
                 cardinal_up_to_hundred: 'pynini.FstLike',
                 include_abbr: bool) -> 'pynini.FstLike':
    """
    Builds the FST that tags a cardinal or a decimal followed by a quantity word,
    e.g. 1 million -> integer_part: "one" quantity: "million"
    e.g. 1.5 million -> integer_part: "one" fractional_part: "five" quantity: "million"

    Args:
        decimal: decimal FST
        cardinal_up_to_hundred: cardinal FST
        include_abbr: when True, the input side of `quantities_abbr` is accepted
            as well as the input side of `quantities`

    Returns:
        Union of the "cardinal + quantity" and "decimal + quantity" graphs.
    """
    # Thousand forms are never accepted after a plain cardinal.
    thousand_forms = pynini.union("k", "K", "thousand")
    # At most one space between the number and the quantity word is removed.
    drop_optional_space = pynini.closure(pynutil.delete(" "), 0, 1)

    allowed_after_cardinal = pynini.project(quantities,
                                            "input") - thousand_forms
    if include_abbr:
        allowed_after_cardinal |= pynini.project(quantities_abbr,
                                                 "input") - thousand_forms

    cardinal_branch = (
        pynutil.insert('integer_part: "') + cardinal_up_to_hundred +
        pynutil.insert('"') + drop_optional_space +
        pynutil.insert(' quantity: "') +
        (allowed_after_cardinal @ (quantities | quantities_abbr)) +
        pynutil.insert('"'))

    # After a decimal every quantity (thousand included) is allowed.
    quantity = (quantities | quantities_abbr) if include_abbr else quantities
    decimal_branch = (decimal + drop_optional_space +
                      pynutil.insert('quantity: "') + quantity +
                      pynutil.insert('"'))

    res = cardinal_branch
    res |= decimal_branch
    return res
Exemplo n.º 2
0
    def get_serial_graph(self):
        """
        Builds the finite state transducer that classifies serial numbers.
            A serial mixes digits, letters and the delimiters "-" and "/", e.g.:
            c325-b -> tokens { cardinal { integer: "си три два пять би" } }

        Returns:
            Optimized graph accepting either mixed letter/digit sequences or
            digit-only sequences containing at least two delimiters.
        """
        digit = self.single_digits_graph
        letter = TO_CYRILLIC | RU_ALPHA

        # A delimiter is an inserted space, or "-" / "/" rewritten to a space.
        separator = insert_space | pynini.cross("-", " ") | pynini.cross(
            "/", " ")

        digit_run = pynini.closure(digit + separator, 1)
        letters_then_digit = pynini.closure(letter + separator, 1) + digit
        digits_then_letter = digit_run + letter
        digits_then_digit = digit_run + digit
        trailing_items = pynini.closure(separator + (letter | digit))
        candidates = (letters_then_digit | digits_then_letter
                      | digits_then_digit) + trailing_items

        # Input-side filter: at least one letter and one digit, in either order.
        source_letter = RU_ALPHA | pynini.project(TO_CYRILLIC, "input")
        letter_before_digit = (NEMO_SIGMA + source_letter + NEMO_SIGMA +
                               NEMO_DIGIT + NEMO_SIGMA)
        digit_before_letter = (NEMO_SIGMA + NEMO_DIGIT + NEMO_SIGMA +
                               source_letter + NEMO_SIGMA)
        mixed_only = letter_before_digit | digit_before_letter
        serial = pynini.compose(mixed_only, candidates.optimize()).optimize()

        # Digit-only serials are accepted when they have two or more delimiters.
        digits_only = (digit + separator + digit + separator + digit +
                       pynini.closure(separator + digit)).optimize()
        serial |= digits_only
        return serial.optimize()
Exemplo n.º 3
0
    def __init__(self, tn_cardinal_tagger: GraphFst, deterministic: bool = True):
        """
        Builds the cardinal classifier by inverting the graphs of a text
        normalization cardinal tagger.

        Args:
            tn_cardinal_tagger: tagger exposing `graph`, `digit`,
                `two_digit_non_zero` and
                `graph_hundred_component_at_least_one_none_zero_digit`
            deterministic: forwarded to the parent constructor
        """
        super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

        # Passes any symbol through and may drop spaces on the output side.
        optional_delete_space = pynini.closure(NEMO_SIGMA | pynutil.delete(" "))

        def _inverted(tn_graph):
            # Compose with the space deleter, then swap input and output.
            return (tn_graph @ optional_delete_space).invert().optimize()

        self.graph_hundred_component_at_least_one_none_zero_digit = _inverted(
            tn_cardinal_tagger.graph_hundred_component_at_least_one_none_zero_digit
        )
        self.graph_ties = _inverted(tn_cardinal_tagger.two_digit_non_zero)

        # this is to make sure if there is an ambiguity with decimal, decimal is chosen, e.g. 1000000 vs. 1 million
        weighted = pynutil.add_weight(_inverted(tn_cardinal_tagger.graph), weight=0.001)
        self.graph_no_exception = weighted

        self.digit = pynini.arcmap(tn_cardinal_tagger.digit, map_type="rmweight").invert().optimize()

        # self.graph rejects every input that self.digit accepts.
        digit_inputs = pynini.project(self.digit, 'input')
        non_digit_inputs = pynini.project(weighted, "input") - digit_inputs.arcsort()
        self.graph = non_digit_inputs @ weighted

        self.optional_minus_graph = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("minus ", '"-" '), 0, 1
        )

        tagged = (
            self.optional_minus_graph
            + pynutil.insert('integer: "')
            + self.graph
            + pynutil.insert('"')
        )
        self.fst = self.add_tokens(tagged).optimize()
Exemplo n.º 4
0
    def __init__(self,
                 whitelist: 'pynini.FstLike',
                 deterministic: bool = True):
        """
        Builds the abbreviation classifier for runs of upper-case letters,
        with or without dots.

        Args:
            whitelist: object whose `graph` attribute holds the whitelist FST;
                inputs accepted by it are excluded here
            deterministic: forwarded to the parent constructor
        """
        super().__init__(name="abbreviation",
                         kind="classify",
                         deterministic=deterministic)

        period = pynini.accep(".")
        dotted_letter = NEMO_UPPER + period

        # A.B.C. -> A. B. C.
        dotted_spaced = dotted_letter + pynini.closure(
            insert_space + dotted_letter, 1)
        # A.B.C. -> A.B.C.
        dotted_as_is = dotted_letter + pynini.closure(dotted_letter, 1)
        # ABC -> ABC
        plain_as_is = NEMO_UPPER + pynini.closure(NEMO_UPPER, 1)
        # ABC -> A B C
        plain_spaced = NEMO_UPPER + pynini.closure(insert_space + NEMO_UPPER,
                                                   1)

        candidates = dotted_spaced | dotted_as_is | plain_as_is | plain_spaced

        # exclude words that are included in the whitelist
        allowed_inputs = pynini.difference(
            pynini.project(candidates, "input"),
            pynini.project(whitelist.graph, "input"))
        filtered = pynini.compose(allowed_inputs, candidates)

        tagged = (pynutil.insert('value: "') + filtered.optimize() +
                  pynutil.insert('"'))
        self.fst = self.add_tokens(tagged).optimize()
Exemplo n.º 5
0
    def __init__(self,
                 whitelist: 'pynini.FstLike',
                 deterministic: bool = True):
        """
        Builds the abbreviation classifier that spells abbreviations out
        letter by letter, with weights that rank the alternative spellings.

        Args:
            whitelist: object whose `graph` attribute holds the whitelist FST;
                inputs accepted by it are excluded here
            deterministic: forwarded to the parent constructor
        """
        super().__init__(name="abbreviation",
                         kind="classify",
                         deterministic=deterministic)

        drop_period = pynutil.delete(".")

        # ABC -> A B C
        main_graph = NEMO_UPPER + pynini.closure(insert_space + NEMO_UPPER, 1)

        # One TO_LOWER symbol followed by space-separated TO_LOWER/NEMO_LOWER symbols.
        lowered_run = pynutil.add_weight(
            TO_LOWER + pynini.closure(insert_space + (TO_LOWER | NEMO_LOWER)),
            110)
        # Two or more upper-case letters followed by spaced lower-case letters.
        upper_then_lower = pynutil.add_weight(
            pynini.closure(NEMO_UPPER, 2) +
            pynini.closure(insert_space + NEMO_LOWER, 1), 110)
        # Dotted upper-case letters; the dots are removed.
        dotted_upper = (NEMO_UPPER + drop_period + pynini.closure(
            insert_space + NEMO_UPPER + drop_period))
        # Dotted TO_LOWER symbols; the dots are removed.
        dotted_lowered = pynutil.add_weight(
            TO_LOWER + drop_period +
            pynini.closure(insert_space + TO_LOWER + drop_period), 110)

        misc_graph = (lowered_run | upper_then_lower | dotted_upper
                      | dotted_lowered)

        # set weight of the misc graph to the value higher then word
        weighted_main = pynutil.add_weight(main_graph.optimize(), 10)
        weighted_misc = pynutil.add_weight(misc_graph.optimize(), 101)
        candidates = weighted_main | weighted_misc

        # exclude words that are included in the whitelist
        allowed_inputs = pynini.difference(
            pynini.project(candidates, "input"),
            pynini.project(whitelist.graph, "input"))
        filtered = pynini.compose(allowed_inputs, candidates)

        tagged = (pynutil.insert('value: "') + filtered.optimize() +
                  pynutil.insert('"'))
        self.fst = self.add_tokens(tagged).optimize()
Exemplo n.º 6
0
    def __init__(self, input_case: str, deterministic: bool = True):
        """
        Builds the whitelist classifier from the whitelist and measurement
        label files plus a TO_LATIN letter-sequence graph.

        Args:
            input_case: casing mode passed to the label loader (see the note
                inside `_get_whitelist_graph`)
            deterministic: forwarded to the parent constructor
        """
        super().__init__(name="whitelist",
                         kind="classify",
                         deterministic=deterministic)

        def _get_whitelist_graph(input_case, file="data/whitelist.tsv"):
            # NOTE(review): the first column is lower-cased for every
            # `input_case` value, so the argument has no effect here.
            # Confirm whether a cased mode was meant to keep the original case.
            rows = load_labels(get_abs_path(file))
            lowered_rows = [[row[0].lower()] + row[1:] for row in rows]
            return pynini.string_map(lowered_rows)

        whitelist_graph = _get_whitelist_graph(input_case)

        all_units = _get_whitelist_graph(input_case,
                                         file="data/measurements.tsv")
        # do not replace single letter units, like `м` or `°`
        multi_symbol_inputs = pynini.difference(
            pynini.project(all_units, "input"), NEMO_ALPHA)
        units_graph = pynini.compose(multi_symbol_inputs, all_units)

        latin_sequence = TO_LATIN + pynini.closure(
            pynutil.insert(" ") + TO_LATIN)

        combined = whitelist_graph
        combined |= units_graph.optimize()
        combined |= latin_sequence

        self.final_graph = convert_space(combined)
        tagged = (pynutil.insert('name: "') + self.final_graph +
                  pynutil.insert('"'))
        self.fst = tagged.optimize()
Exemplo n.º 7
0
  def _make_sigma_star(self) -> pynini.Fst:
    r"""Builds \Sigma^* over bytes plus the feature labels.

    The alphabet is the union of `byte.BYTE` and the input side of
    `self._feature_mapper`.

    Returns:
      The optimized closure of that alphabet.
    """
    feature_symbols = pynini.project(self._feature_mapper, "input")
    alphabet = pynini.union(byte.BYTE, feature_symbols)
    sigma_star = alphabet.closure()
    return sigma_star.optimize()
Exemplo n.º 8
0
def get_one_to_one_thousand(cardinal: 'pynini.FstLike') -> 'pynini.FstLike':
    """
    Produces an acceptor for the verbalizations of the numbers 1 through 999
    (the digit strings come from `range(1, 1000)`). Needed for ordinals and fractions.

    Args:
        cardinal: cardinal graph the digit strings are composed with

    Returns:
        fst: the optimized output projection of that composition
    """
    digit_strings = list(map(str, range(1, 1000)))
    verbalized = pynini.string_map(digit_strings) @ cardinal
    spoken_forms = pynini.project(verbalized, "output")
    return spoken_forms.optimize()
Exemplo n.º 9
0
def get_quantity(decimal: "pynini.FstLike",
                 cardinal_up_to_hundred: "pynini.FstLike") -> "pynini.FstLike":
    """
    Builds the FST that turns a cardinal or decimal followed by a quantity word into a numeral,
    e.g. một triệu -> integer_part: "1" quantity: "triệu"
    e.g. một tỷ rưỡi -> integer_part: "1" fractional_part: "5" quantity: "tỷ"

    Args:
        decimal: decimal FST
        cardinal_up_to_hundred: cardinal FST

    Returns:
        Union of the "cardinal + quantity (+ optional fraction)" graph and
        the "decimal + quantity" graph.
    """
    # Drop leading zeros and require at least one non-zero digit.
    strip_leading_zeros = (pynutil.delete(pynini.closure("0")) +
                           pynini.difference(NEMO_DIGIT, "0") +
                           pynini.closure(NEMO_DIGIT))
    numbers = cardinal_up_to_hundred @ strip_leading_zeros

    suffix = pynini.union("triệu", "tỉ", "tỷ", "vạn")

    # Special spoken forms allowed for the trailing fraction digit.
    graph_four = pynini.cross("tư", "4")
    graph_one = pynini.cross("mốt", "1")
    graph_half = pynini.cross("rưỡi", "5")

    # The plain digit graph is used with the input "năm" removed.
    excluded_input = pynini.project(pynini.cross("năm", "5"), "input")
    plain_digit = (pynini.project(graph_digit, "input") -
                   excluded_input.arcsort()) @ graph_digit
    last_digit = pynini.union(plain_digit, graph_one, graph_four, graph_half)

    fraction = (delete_extra_space + pynutil.insert('fractional_part: "') +
                (last_digit | graph_half | graph_one | graph_four) +
                pynutil.insert('"'))
    optional_fraction_graph = pynini.closure(fraction, 0, 1)

    cardinal_branch = (pynutil.insert('integer_part: "') + numbers +
                       pynutil.insert('"') + delete_extra_space +
                       pynutil.insert('quantity: "') + suffix +
                       pynutil.insert('"') + optional_fraction_graph)
    # After a decimal the thousand words are accepted as quantities too.
    decimal_branch = (decimal + delete_extra_space +
                      pynutil.insert('quantity: "') +
                      (suffix | "ngàn" | "nghìn") + pynutil.insert('"'))

    res = cardinal_branch
    res |= decimal_branch
    return res
Exemplo n.º 10
0
    def __init__(self):
        """
        Builds the telephone classifier for spoken digit sequences shaped
        like a 3-3-4 digit number written with "-" separators; a repeated
        digit may also be spoken as "double <digit>".
        """
        super().__init__(name="telephone", kind="classify")
        # country code, number_part, extension
        digit_to_str = pynini.invert(
            pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        ).optimize() | pynini.cross("0", pynini.union("o", "oh", "zero"))

        # One rule per digit: "<d> <d>" on one side, "double <d>" on the other.
        double_rules = []
        for i in range(10):
            spoken = pynini.project(str(i) @ digit_to_str, "output")
            double_rules.append(
                pynini.cross(
                    spoken + pynini.accep(" ") + spoken,
                    pynutil.insert("double ") + spoken,
                )
            )
        double_digit = pynini.union(*double_rules)
        # Invert in place so the "double <d>" form becomes the input side.
        double_digit.invert()

        spaced_digit = digit_to_str + insert_space
        group_of_three = pynini.closure(spaced_digit, 2, 2) + digit_to_str
        group_of_four = pynini.closure(spaced_digit, 3, 3) + digit_to_str
        dash_to_space = pynutil.delete("-") + insert_space
        written_to_spoken = (
            group_of_three
            + dash_to_space
            + group_of_three
            + dash_to_space
            + group_of_four
        )

        expand_doubles = pynini.cdrewrite(double_digit, "", "", NEMO_SIGMA)
        spoken_to_written = expand_doubles @ pynini.invert(written_to_spoken)
        number_part = (
            pynutil.insert('number_part: "')
            + spoken_to_written
            + pynutil.insert('"')
        )

        self.fst = self.add_tokens(number_part).optimize()
Exemplo n.º 11
0
    def __init__(self,
                 itn_cardinal_tagger: GraphFst,
                 itn_decimal_tagger: GraphFst,
                 deterministic: bool = True):
        """
        Builds the money classifier from the cardinal and decimal taggers.

        Args:
            itn_cardinal_tagger: tagger exposing `graph_no_exception`
            itn_decimal_tagger: tagger exposing `final_graph_wo_negative`
            deterministic: forwarded to the parent constructor
        """
        super().__init__(name="money",
                         kind="classify",
                         deterministic=deterministic)

        # A whole input of "ein"/"eine" is rewritten to "eins" before tagging.
        one_to_eins = pynini.cdrewrite(
            pynini.cross(pynini.union("ein", "eine"), "eins"), "[BOS]",
            "[EOS]", NEMO_SIGMA)
        cardinal_graph = one_to_eins @ itn_cardinal_tagger.graph_no_exception
        decimal_graph = itn_decimal_tagger.final_graph_wo_negative

        currency = (pynutil.insert('currency: "') +
                    convert_space(pynini.invert(maj_singular)) +
                    pynutil.insert('"'))

        # Two digits pass through; a single digit gets a leading "0".
        add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (
            pynutil.insert("0") + NEMO_DIGIT)
        two_digit_cents = cardinal_graph @ add_leading_zero_to_double_digit
        min_unit = pynini.project(min_singular | min_plural, "output")

        # elf euro (und) vier cent, vier cent
        cents_standalone = (pynutil.insert('fractional_part: "') +
                            two_digit_cents + delete_space +
                            pynutil.delete(min_unit) + pynutil.insert('"'))
        optional_und = pynini.closure(
            pynutil.delete("und") + delete_space, 0, 1)
        optional_cents_standalone = pynini.closure(
            delete_space + optional_und + insert_space + cents_standalone,
            0,
            1,
        )

        # elf euro vierzig, only after integer
        optional_cents_suffix = pynini.closure(
            delete_extra_space + pynutil.insert('fractional_part: "') +
            pynutil.add_weight(two_digit_cents, -0.7) + pynutil.insert('"'),
            0,
            1,
        )

        integer_branch = (pynutil.insert('integer_part: "') + cardinal_graph +
                          pynutil.insert('"') + delete_extra_space +
                          currency +
                          (optional_cents_standalone | optional_cents_suffix))

        decimal_branch = decimal_graph + delete_extra_space + currency
        # Cents on their own are tagged as zero euros plus a fraction.
        decimal_branch |= pynutil.insert(
            'currency: "€" integer_part: "0" ') + cents_standalone

        tagged = integer_branch | decimal_branch
        self.fst = self.add_tokens(tagged).optimize()
Exemplo n.º 12
0
    def __init__(self, cardinal: GraphFst):
        """
        Builds the date classifier covering day-month(-year), month-year,
        month-only and year-only expressions.

        Args:
            cardinal: tagger exposing `graph_no_exception`, used for the day
        """
        super().__init__(name="date", kind="classify")

        YEAR_WEIGHT = 0.001
        year_graph = pynutil.add_weight(_get_year_graph(), YEAR_WEIGHT)

        month_field = (pynutil.insert('month: "') + _get_month_graph() +
                       pynutil.insert('"'))
        # A bare month must not consume "năm", which also introduces the year below.
        excluded_month = pynini.project(pynini.cross("năm", "5"), "input")
        month_field_restricted = (pynini.project(month_field, "input") -
                                  excluded_month.arcsort()) @ month_field

        day_field = (pynutil.insert('day: "') + cardinal.graph_no_exception +
                     pynutil.insert('"'))

        # Year that follows a month: "năm" is removed and the weight added
        # above is cancelled.
        trailing_year = (delete_extra_space + pynutil.delete("năm") +
                         delete_extra_space + pynutil.insert('year: "') +
                         pynutil.add_weight(year_graph, -YEAR_WEIGHT) +
                         pynutil.insert('"'))
        optional_trailing_year = pynini.closure(trailing_year, 0, 1)

        graph_month = (pynutil.delete("tháng") + delete_space +
                       month_field_restricted)
        graph_my = (pynutil.delete("tháng") + delete_space + month_field +
                    trailing_year)
        graph_dmy = (day_field + delete_space + pynutil.delete("tháng") +
                     delete_extra_space + month_field +
                     optional_trailing_year)
        # Year on its own keeps the weighted year graph.
        graph_year = (pynutil.delete("năm") + delete_extra_space +
                      pynutil.insert('year: "') + year_graph +
                      pynutil.insert('"'))

        tagged = (graph_dmy | graph_my | graph_month
                  | graph_year) + pynutil.insert(" preserve_order: true")
        self.fst = self.add_tokens(tagged).optimize()
Exemplo n.º 13
0
def Tester(stream, far_reader):
  """Runs rewrite rules on test inputs and reports mismatches on stderr.

  Each line of `stream` must hold three tab-separated fields: a
  comma-separated list of rule names, the input string, and the expected
  output. Lines with a different field count are skipped with a message.
  Reported line numbers are zero-based.

  Args:
    stream:     iterable of test lines
    far_reader: mapping from rule name to rule FST; raises KeyError for
                unknown rules

  Returns: None
  """
  all_passed = True
  for lineno, raw_line in enumerate(stream):
    columns = raw_line.strip('\n').split('\t')
    if len(columns) != 3:
      sys.stderr.write('Skipping line %d (wrong number of fields)\n' % lineno)
      continue
    rules, input_, output = columns
    composed = input_
    for rule_name in rules.split(','):
      try:
        rule_fst = far_reader[rule_name]
      except KeyError:
        sys.stderr.write('Warning: cannot find rule %s, line %d\n'
                         % (rule_name, lineno))
        all_passed = False
        break
      composed = composed * rule_fst
    else:
      # Every rule was found: compare the best output with the expectation.
      best_path = pynini.shortestpath(pynini.project(composed, True))
      paths = pynini.StringPaths(best_path)
      # Only the first string is used; no path at all yields ''.
      pred = '' if paths.done() else paths.istring()
      if pred != output:
        all_passed = False
        sys.stderr.write('Line %d: input and output do not match for\n'
                         '   Rules:\t%s\n'
                         '   Input:\t%s\nExpected:\t%s\n  Actual:\t%s\n' %
                         (lineno, rules, input_, output, pred))
  if all_passed:
    sys.stderr.write('All tests pass!!\n')
  else:
    sys.stderr.write('Some rewrites failed\n')
Exemplo n.º 14
0
def core_visual_norm_fsts(rewrite_file: os.PathLike,
                          preserve_file: os.PathLike,
                          consonant_file: os.PathLike,
                          sigma: pynini.Fst) -> List[pynini.Fst]:
  """Creates the component FSTs of visual normalization.

  The returned FSTs, applied in order, perform the rewrites described in the
  StringFile `rewrite_file` and additionally clear out instances of ZWJ, ZWNJ,
  and ZWS, except for those that match `preserve_file` when occurring between
  consonants (which are specified in the consonants file).

  Args:
    rewrite_file: Path relative to the runfiles directory of a StringFile of
      visual rewrites.
    preserve_file: Path relative to the runfiles directory of a StringFile of
      ZWJ sequences to preserve.
    consonant_file: Path relative to the runfiles directory of a StringFile
      containing a native--latin consonant mapping.
    sigma: An Fst with which to consider the complete alphabet for cdrewrites.

  Returns:
    A list of FSTs: the visual rewrite, the preserve marker, the joiner
    cleaner, the preserve reinstater, and `sigma.star`.
  """
  visual_rewrite = rule.fst_from_rule_file(rewrite_file, sigma)
  preserve = uf.StringFile(preserve_file)
  consonant = pynini.project(uf.StringFile(consonant_file), 'input')

  # This makes sure that the generated symbols used as implementation
  # detail symbols for ZWJ preservation are considered as part of sigma.
  # Generated symbols are those delimited by square brackets, such as
  # `[ZWJ,VIRAMA]` for example.
  generated_sigma = u.BuildSigmaFstFromSymbolTable(pynini.generated_symbols())
  intermediate_sigma = generated_sigma.union(sigma)

  joiners = pynini.union(uc.ZWNJ, uc.ZWJ, uc.ZWS)
  mark_preserve = ur.Rewrite(preserve, intermediate_sigma, consonant, consonant)
  clean_joiner = ur.Rewrite(pynutil.delete(joiners), intermediate_sigma)
  reinstate = ur.Rewrite(pynini.invert(preserve), intermediate_sigma)

  stages = [visual_rewrite, mark_preserve, clean_joiner, reinstate]
  # We right-compose with sigma.star to ensure the generated_symbols
  # don't leak through into the visual_norm fst.
  stages.append(sigma.star)
  return stages
Exemplo n.º 15
0
    def _assert_fst_sampled_behavior(
            self, fsts: List[pynini.Fst], token_type: pynini.TokenType,
            samples: int, assert_function: Callable[[pynini.Fst, pynini.Fst],
                                                    None]) -> None:
        """Asserts that the FSTs composed on sampled inputs follow a behavior.

        Samples are drawn from the input projection of the first FST and each
        one is composed with all the FSTs. This is used in lieu of statically
        verifying that the composition has a specific property, as that isn't
        easy to answer for non-deterministic FSTs. If token_type is "byte",
        the input projection is first intersected with the closure over valid
        UTF-8 characters so that every sample is a valid UTF-8 string that
        Python can handle. Sample length is capped by `_MAX_SAMPLE_LENGTH`.

        Args:
          fsts: List of FSTs to be applied on a sample to verify if the
              resultant FST obeys the property specified in the function.
          token_type: The token_type used to derive the FST.
          samples: The number of input samples to take to verify functionality.
          assert_function: An assert function with input string FSA and output
              FST as parameters. This function is run in the
              `pynini.default_token_type` environment. This function raises
              AssertionError on assert failure.
        """
        sample_space = pynini.project(fsts[0], "input")
        if token_type == "byte":
            # NOTE: Randgenning directly from the byte machine is bound to lead to
            # trouble since it can generate things that aren't well-formed UTF-8
            # sequences and thus cannot be put into a Python str type.
            sample_space = pynini.intersect(sample_space,
                                            utf8.VALID_UTF8_CHAR.star)
        sampled_paths = pynini.randgen(sample_space,
                                       npath=samples,
                                       max_length=_MAX_SAMPLE_LENGTH)
        with pynini.default_token_type(token_type):
            for labels in _olabels_iter(sampled_paths):
                sample_fsa = _label_list_to_string_fsa(labels)
                composed = rewrite.ComposeFsts([sample_fsa] + fsts)
                assert_function(sample_fsa, composed)
Exemplo n.º 16
0
def generator_main(exporter_map: multi_grm.ExporterMapping):
  """Generates FSTs for visual normalization of Brahmic scripts.

  For each token type ('byte' and 'utf8') one FST is built per script in
  `u.SCRIPTS` and one per language in `u.LANG_SCRIPT_MAP`. Each is stored in
  the exporter for that token type under its upper-cased name.

  Args:
    exporter_map: mapping from token type to the exporter receiving the FSTs.
  """
  for token_type in ('byte', 'utf8'):
    fst_by_name = {}
    with pynini.default_token_type(token_type):
      sigma_by_script = {}

      # Script-level FST: NFC, mark dedup, then the core visual norm stages.
      for script in u.SCRIPTS:
        script_dir = u.SCRIPT_DIR / script
        sigma = u.OpenSigma(script, token_type)
        sigma_by_script[script] = sigma
        dedup = cu.dedup_marks_fst(script, sigma)
        nfc = open_nfc(script, token_type)
        core_stages = core_visual_norm_fsts(
            script_dir / 'visual_rewrite.tsv',
            script_dir / 'preserve.tsv',
            script_dir / 'consonant.tsv',
            sigma)
        fst_by_name[script] = ur.ComposeFsts([nfc, dedup] + core_stages)

      # Language-level FST: the script FST followed by the language's
      # before-consonant and after-consonant rewrites.
      for script, langs in u.LANG_SCRIPT_MAP.items():
        for lang in langs:
          sigma = sigma_by_script[script]
          consonant = pynini.project(
              uf.StringFile(u.SCRIPT_DIR / script / 'consonant.tsv'), 'input')
          lang_dir = u.SCRIPT_DIR / script / lang

          before_cons = uf.StringFile(lang_dir / 'before_consonant.tsv')
          rewrite_before_cons = ur.Rewrite(before_cons, sigma, right=consonant)
          after_cons = uf.StringFile(lang_dir / 'after_consonant.tsv')
          rewrite_after_cons = ur.Rewrite(after_cons, sigma, left=consonant)

          fst_by_name[lang] = ur.ComposeFsts([
              fst_by_name[script], rewrite_before_cons, rewrite_after_cons])

      exporter = exporter_map[token_type]
      for name, fst in fst_by_name.items():
        exporter[name.upper()] = fst
Exemplo n.º 17
0
    def __init__(self, deterministic: bool = True):
        """
        Builds the electronic classifier for e-mail addresses, bare domains
        and URLs that start with a protocol prefix.

        Three shapes are tagged:
            username + "@" + domain            (email)
            name + common domain (+ path)      (abc.com, abc.com/123-sm)
            protocol + domain                  (www.abc.com, https://www.abc.com/x)

        Args:
            deterministic: forwarded to the parent constructor
        """
        super().__init__(name="electronic",
                         kind="classify",
                         deterministic=deterministic)

        # Load the symbol table once; its input side is the set of accepted
        # symbols, the full mapping is used to verbalize protocol symbols.
        graph_symbols = pynini.string_file(
            get_abs_path("data/electronic/symbol.tsv")).optimize()
        accepted_symbols = pynini.project(graph_symbols, "input")
        accepted_common_domains = pynini.project(
            pynini.string_file(get_abs_path("data/electronic/domain.tsv")),
            "input")

        # A letter followed by any mix of letters, digits and accepted symbols.
        all_accepted_symbols = NEMO_ALPHA + pynini.closure(NEMO_ALPHA
                                                           | NEMO_DIGIT
                                                           | accepted_symbols)

        username = (pynutil.insert("username: \"") + all_accepted_symbols +
                    pynutil.insert("\"") + pynini.cross('@', ' '))
        domain_graph = all_accepted_symbols + pynini.accep(
            '.') + all_accepted_symbols

        # ":" is a colon; it was previously verbalized as "semicolon".
        protocol_symbols = pynini.closure((graph_symbols
                                           | pynini.cross(":", "colon")) +
                                          pynutil.insert(" "))
        protocol_start = (pynini.cross("https", "HTTPS ") | pynini.cross(
            "http", "HTTP ")) + (pynini.accep("://") @ protocol_symbols)
        protocol_file_start = pynini.accep("file") + insert_space + (
            pynini.accep(":///") @ protocol_symbols)

        # `@` binds tighter than `+`; the parentheses only make that explicit.
        protocol_end = pynini.cross(
            "www", "WWW ") + (pynini.accep(".") @ protocol_symbols)
        protocol = protocol_file_start | protocol_start | protocol_end | (
            protocol_start + protocol_end)

        # Domains must not themselves begin with a protocol prefix.
        starts_with_protocol = pynini.project(protocol, "input") + NEMO_SIGMA

        domain_graph = (
            pynutil.insert("domain: \"") +
            pynini.difference(domain_graph, starts_with_protocol) +
            pynutil.insert("\""))
        domain_common_graph = (
            pynutil.insert("domain: \"") + pynini.difference(
                all_accepted_symbols + accepted_common_domains +
                pynini.closure(
                    accepted_symbols +
                    pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols),
                    0, 1),
                starts_with_protocol,
            ) + pynutil.insert("\""))

        protocol = pynutil.insert("protocol: \"") + protocol + pynutil.insert(
            "\"")
        # email
        graph = username + domain_graph
        # abc.com, abc.com/123-sm
        graph |= domain_common_graph
        # www.abc.com/sdafsdf, or https://www.abc.com/asdfad or www.abc.abc/asdfad
        graph |= protocol + pynutil.insert(" ") + domain_graph

        final_graph = self.add_tokens(graph)

        self.fst = final_graph.optimize()
Exemplo n.º 18
0
    GraphFst,
    delete_preserve_order,
)
from nemo_text_processing.text_normalization.es.graph_utils import shift_cardinal_gender, strip_cardinal_apocope
from nemo_text_processing.text_normalization.es.utils import get_abs_path

try:
    import pynini
    from pynini.lib import pynutil

    # For each currency table the input side is bound to the *_singular name
    # and the output side to the *_plural name.
    fem = pynini.string_file(
        get_abs_path("data/money/currency_plural_fem.tsv"))
    fem_singular = pynini.project(fem, "input")
    fem_plural = pynini.project(fem, "output")

    masc = pynini.string_file(
        get_abs_path("data/money/currency_plural_masc.tsv"))
    masc_singular = pynini.project(masc, "input")
    masc_plural = pynini.project(masc, "output")

    PYNINI_AVAILABLE = True

# ModuleNotFoundError is a subclass of ImportError, so this covers both.
except ImportError:
    # Without pynini the tables are placeholders only.
    fem_plural = masc_plural = None
    fem_singular = masc_singular = None

    PYNINI_AVAILABLE = False
Exemplo n.º 19
0
    delete_extra_space,
    delete_preserve_order,
)
from nemo_text_processing.text_normalization.es.graph_utils import ones
from nemo_text_processing.text_normalization.es.utils import get_abs_path

# Load the measurement-unit singular/plural tables when pynini can be
# imported; otherwise bind the public names to None so this module still
# imports.
try:
    import pynini
    from pynini.lib import pynutil

    # Feminine units. unit_plural_fem first holds the whole table and is then
    # rebound to its output side, so the singular (input side) must be taken
    # before the rebind.
    unit_plural_fem = pynini.string_file(
        get_abs_path("data/measures/measurements_plural_fem.tsv"))
    unit_singular_fem = pynini.project(unit_plural_fem, "input")
    unit_plural_fem = pynini.project(unit_plural_fem, "output")

    # Masculine units, same layout and same ordering constraint.
    unit_plural_masc = pynini.string_file(
        get_abs_path("data/measures/measurements_plural_masc.tsv"))
    unit_singular_masc = pynini.project(unit_plural_masc, "input")
    unit_plural_masc = pynini.project(unit_plural_masc, "output")

    PYNINI_AVAILABLE = True

except ImportError:  # also covers ModuleNotFoundError, which subclasses it
    unit_singular_fem = unit_plural_fem = None
    unit_singular_masc = unit_plural_masc = None

    PYNINI_AVAILABLE = False
Exemplo n.º 20
0
    def __init__(self):
        """
        Build the cardinal classifier that maps Spanish number words to digits.

        The grammar reads word-to-digit tables from ``data/numbers/*.tsv``,
        assembles zero-padded digit groups for the hundreds, "mil",
        "millones"/"millardo", "billones" and "trillones" positions, and then
        strips leading zeros from the concatenated digit string.

        Attributes set:
            graph_hundred_component_at_least_one_none_zero_digit: the hundreds
                group restricted to outputs that contain a non-zero digit.
            graph_no_exception: the full words -> digits graph.
            numbers_up_to_thousand: that graph limited to 1-3 digit outputs.
            numbers_up_to_million: that graph limited to 1-6 digit outputs.
            graph: ``graph_no_exception`` without the inputs listed in the
                digit and zero tables.
            fst: the optimized final graph: an optional ``negative: "-"`` field
                followed by ``integer: "<digits>"``, passed through
                ``self.add_tokens``.
        """
        super().__init__(name="cardinal", kind="classify")
        # Word -> digit tables. NOTE(review): the width of each table's output
        # (one digit, two digits, ...) is defined by the .tsv files, which are
        # not visible here.
        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        graph_digit = pynini.string_file(
            get_abs_path("data/numbers/digit.tsv"))
        graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
        graph_twenties = pynini.string_file(
            get_abs_path("data/numbers/twenties.tsv"))
        graph_hundreds = pynini.string_file(
            get_abs_path("data/numbers/hundreds.tsv"))

        # Hundreds group: a hundreds word or an inserted "0", followed by
        # either a twenties/teen word (or an inserted "00"), or a ties slot plus
        # a digit slot where each missing slot is filled with an inserted "0".
        graph_hundred_component = graph_hundreds | pynutil.insert("0")
        graph_hundred_component += delete_space
        graph_hundred_component += pynini.union(
            graph_twenties | graph_teen | pynutil.insert("00"),
            (graph_ties | pynutil.insert("0")) + delete_space +
            (graph_digit | pynutil.insert("0")),
        )

        # Same group, but the output must contain at least one digit other
        # than "0".
        graph_hundred_component_at_least_one_none_zero_digit = graph_hundred_component @ (
            pynini.closure(NEMO_DIGIT) +
            (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT))
        self.graph_hundred_component_at_least_one_none_zero_digit = (
            graph_hundred_component_at_least_one_none_zero_digit)

        # Thousands group: "<group> mil", bare "mil" -> "001", or an inserted
        # "000" (weight 0.1) when the group is absent.
        graph_thousands = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space + pynutil.delete("mil"),
            pynutil.insert("001") +
            pynutil.delete("mil"),  # because we say 'mil', not 'un mil'
            pynutil.insert("000", weight=0.1),
        )

        # Millions group: "<group> millones|millón", or bare "millones" ->
        # "000" (used after a preceding "mil").
        graph_millones = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space +
            (pynutil.delete("millones") | pynutil.delete("millón")),
            pynutil.insert("000") +
            pynutil.delete("millones"),  # to allow for 'mil millones'
        )

        # Thousand-millions: a "mil" group followed by a millions group, or six
        # inserted zeros (weight 0.1) when both are absent.
        graph_mil_millones = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space + pynutil.delete("mil"),
            pynutil.insert("001") +
            pynutil.delete("mil"),  # because we say 'mil', not 'un mil'
        )
        graph_mil_millones += delete_space + (
            graph_millones | pynutil.insert("000") + pynutil.delete("millones")
        )  # allow for 'mil millones'
        graph_mil_millones |= pynutil.insert("000000", weight=0.1)

        # also allow 'millardo' instead of 'mil millones'
        graph_millardo = (
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space +
            (pynutil.delete("millardo") | pynutil.delete("millardos")))

        # Billones / mil billones: same construction as millones above.
        graph_billones = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space +
            (pynutil.delete("billones") | pynutil.delete("billón")), )

        graph_mil_billones = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space + pynutil.delete("mil"),
            pynutil.insert("001") +
            pynutil.delete("mil"),  # because we say 'mil', not 'un mil'
        )
        graph_mil_billones += delete_space + (
            graph_billones | pynutil.insert("000") + pynutil.delete("billones")
        )  # allow for 'mil billones'
        graph_mil_billones |= pynutil.insert("000000", weight=0.1)

        # Trillones / mil trillones: same construction again.
        graph_trillones = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space +
            (pynutil.delete("trillones") | pynutil.delete("trillón")), )

        graph_mil_trillones = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space + pynutil.delete("mil"),
            pynutil.insert("001") +
            pynutil.delete("mil"),  # because we say 'mil', not 'un mil'
        )
        graph_mil_trillones += delete_space + (
            graph_trillones | pynutil.insert("000") +
            pynutil.delete("trillones"))  # allow for 'mil trillones'
        graph_mil_trillones |= pynutil.insert("000000", weight=0.1)

        # Full number: the groups concatenated from largest to smallest, or
        # the zero table on its own.
        graph = pynini.union(
            (graph_mil_trillones
             | pynutil.insert("000", weight=0.1) + graph_trillones) +
            delete_space +
            (graph_mil_billones
             | pynutil.insert("000", weight=0.1) + graph_billones) +
            delete_space + pynini.union(
                graph_mil_millones,
                pynutil.insert("000", weight=0.1) + graph_millones,
                graph_millardo + graph_millones,
                graph_millardo + pynutil.insert("000", weight=0.1),
            ) + delete_space + graph_thousands + delete_space +
            graph_hundred_component,
            graph_zero,
        )

        # Strip leading zeros from the padded digit string; a lone "0" passes
        # through unchanged.
        graph = graph @ pynini.union(
            pynutil.delete(pynini.closure("0")) + pynini.difference(
                NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0")

        # ignore "y" inside cardinal numbers
        # (a "y" between two spaces is deleted; the rewritten string must then
        # match NEMO_ALPHA + NEMO_SIGMA before reaching the number graph)
        graph = (
            pynini.cdrewrite(pynutil.delete("y"), NEMO_SPACE, NEMO_SPACE,
                             NEMO_SIGMA) @ (NEMO_ALPHA + NEMO_SIGMA) @ graph)

        self.graph_no_exception = graph

        # save self.numbers_up_to_thousand for use in DecimalFst
        digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3)
        numbers_up_to_thousand = pynini.compose(
            graph, digits_up_to_thousand).optimize()
        self.numbers_up_to_thousand = numbers_up_to_thousand

        # save self.numbers_up_to_million for use in DecimalFst
        digits_up_to_million = (NEMO_DIGIT
                                | (NEMO_DIGIT**2)
                                | (NEMO_DIGIT**3)
                                | (NEMO_DIGIT**4)
                                | (NEMO_DIGIT**5)
                                | (NEMO_DIGIT**6))
        numbers_up_to_million = pynini.compose(
            graph, digits_up_to_million).optimize()
        self.numbers_up_to_million = numbers_up_to_million

        # don't convert cardinals from zero to nine inclusive
        graph_exception = pynini.project(pynini.union(graph_digit, graph_zero),
                                         'input')

        # Restrict the input side to everything the graph accepts except the
        # words listed in the digit and zero tables.
        self.graph = (pynini.project(graph, "input") -
                      graph_exception.arcsort()) @ graph

        # Optional leading "menos" followed by a space becomes the
        # negative: "-" field.
        optional_minus_graph = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("menos", "\"-\"") +
            NEMO_SPACE, 0, 1)

        final_graph = optional_minus_graph + pynutil.insert(
            "integer: \"") + self.graph + pynutil.insert("\"")

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Exemplo n.º 21
0
    def __init__(self):
        """
        Build the cardinal classifier that maps Vietnamese number words to
        digits.

        The grammar reads word-to-digit tables from ``data/numbers/*.tsv``, adds
        the positional variants "mốt" (1), "tư" (4), "lăm" (5) and "rưỡi" (5),
        assembles zero-padded digit groups for the hundreds, thousands
        ("nghìn"/"ngàn"), ten-thousands ("vạn"), millions ("triệu") and
        billions ("tỉ"/"tỷ"), and then strips leading zeros.

        Attributes set:
            graph_hundred_component_at_least_one_none_zero_digit: the hundreds
                group restricted to outputs that contain a non-zero digit.
            graph_no_exception: the full words -> digits graph.
            graph: ``graph_no_exception`` without the inputs listed in the
                digit and zero tables.
            fst: the optimized final graph: an optional ``negative: "-"`` field
                followed by ``integer: "<digits>"``, passed through
                ``self.add_tokens``.
        """
        super().__init__(name="cardinal", kind="classify")
        # Word -> digit tables. NOTE(review): the output width of each table is
        # defined by the .tsv files, which are not visible here.
        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        graph_digit = pynini.string_file(
            get_abs_path("data/numbers/digit.tsv"))
        graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))

        # Alternative words that map to a digit, and unit words ("trăm",
        # "mươi") that are consumed without producing output.
        graph_one = pynini.cross("mốt", "1")
        graph_four = pynini.cross("tư", "4")
        graph_five = pynini.cross("lăm", "5")
        graph_half = pynini.cross("rưỡi", "5")
        graph_hundred = pynini.cross("trăm", "")
        graph_ten = pynini.cross("mươi", "")
        # "linh" / "lẻ" produce a "0" (used below in the tens slot).
        zero = pynini.cross(pynini.union("linh", "lẻ"), "0")

        optional_ten = pynini.closure(delete_space + graph_ten, 0, 1)
        last_digit = graph_digit | graph_one | graph_four | graph_five

        # Hundreds group with an explicit "<digit|zero> trăm" prefix, followed
        # by one of the listed tens/units forms, or an inserted "00" when
        # nothing follows.
        graph_hundred_component = (graph_digit
                                   | graph_zero) + delete_space + graph_hundred
        graph_hundred_component += delete_space
        graph_hundred_component += pynini.union(
            graph_teen,
            graph_ties + optional_ten +
            ((delete_space + last_digit) | pynutil.insert("0")),
            (graph_half | graph_four | graph_one) + pynutil.insert("0"),
            zero + delete_space + (graph_digit | graph_four),
            graph_digit,
            pynutil.insert("00"),
        )
        # Hundreds group without a hundreds word: an inserted "0" stands in for
        # the hundreds slot.
        graph_hundred_component |= (
            pynutil.insert("0") + delete_space + pynini.union(
                graph_teen,
                graph_ties + optional_ten +
                ((delete_space + last_digit) | pynutil.insert("0")),
                zero + delete_space + (graph_digit | graph_four),
                graph_digit,
            ))

        # Same group, but the output must contain at least one digit other
        # than "0".
        graph_hundred_component_at_least_one_none_zero_digit = graph_hundred_component @ (
            pynini.closure(NEMO_DIGIT) +
            (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT))
        self.graph_hundred_component_at_least_one_none_zero_digit = (
            graph_hundred_component_at_least_one_none_zero_digit)

        # Each magnitude group below is either "<group> <unit word>" or an
        # inserted run of zeros (weight 0.1) when the group is absent.
        graph_thousands = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space + pynutil.delete(pynini.union("nghìn", "ngàn")),
            pynutil.insert("000", weight=0.1),
        )

        graph_ten_thousand = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space + pynutil.delete("vạn"),
            pynutil.insert("0000", weight=0.1),
        )

        # Single thousands digit that may follow a "vạn" group.
        graph_ten_thousand_suffix = pynini.union(
            graph_digit + delete_space +
            pynutil.delete(pynini.union("nghìn", "ngàn")),
            pynutil.insert("0", weight=0.1),
        )

        graph_million = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space + pynutil.delete("triệu"),
            pynutil.insert("000", weight=0.1),
        )
        graph_billion = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space + pynutil.delete(pynini.union("tỉ", "tỷ")),
            pynutil.insert("000", weight=0.1),
        )

        # Full number, one of:
        #   billion + million + thousands + hundreds groups,
        #   "vạn" group + thousands digit + hundreds group,
        #   "<group> nghìn|ngàn" followed by a lone digit padded with "00" or
        #   by a hundreds group,
        #   the zero table on its own.
        graph = pynini.union(
            graph_billion + delete_space + graph_million + delete_space +
            graph_thousands + delete_space + graph_hundred_component,
            graph_ten_thousand + delete_space + graph_ten_thousand_suffix +
            delete_space + graph_hundred_component,
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space + pynutil.delete(pynini.union("nghìn", "ngàn")) +
            delete_space +
            ((last_digit + pynutil.insert("00")) | graph_hundred_component),
            graph_zero,
        )

        # Strip leading zeros from the padded digit string; a lone "0" passes
        # through unchanged.
        graph = graph @ pynini.union(
            pynutil.delete(pynini.closure("0")) + pynini.difference(
                NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0")

        # don't convert cardinals from zero to nine inclusive
        graph_exception = pynini.project(pynini.union(graph_digit, graph_zero),
                                         'input')

        self.graph_no_exception = graph

        # Restrict the input side to everything the graph accepts except the
        # words listed in the digit and zero tables.
        self.graph = (pynini.project(graph, "input") -
                      graph_exception.arcsort()) @ graph

        # Optional leading "âm" / "trừ" followed by a space becomes the
        # negative: "-" field.
        optional_minus_graph = pynini.closure(
            pynutil.insert("negative: ") +
            pynini.cross(pynini.union("âm", "trừ"), "\"-\"") + NEMO_SPACE, 0,
            1)

        final_graph = optional_minus_graph + pynutil.insert(
            "integer: \"") + self.graph + pynutil.insert("\"")

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Exemplo n.º 22
0
def _input_string_file(filename: os.PathLike,
                       return_if_empty: pynini.Fst = uf.EMPTY) -> pynini.Fst:
    """
    Load ``filename`` with ``uf.StringFile`` and return only its input side.

    Args:
        filename: path handed to ``uf.StringFile``.
        return_if_empty: forwarded unchanged to ``uf.StringFile``
            (presumably the FST to use when the file has no entries; confirm
            against ``uf.StringFile``).

    Returns:
        The input projection of the loaded FST, with epsilons removed.
    """
    loaded = uf.StringFile(filename, return_if_empty)
    input_side = pynini.project(loaded, 'input')
    return input_side.rmepsilon()
Exemplo n.º 23
0
    def __init__(self,
                 cardinal: GraphFst,
                 ordinal: GraphFst,
                 deterministic: bool = True,
                 lm: bool = False):
        """
        Finite state transducer for classifying serial values: combinations of
        digits, letters and symbols, optionally separated by "-", "/" or " ",
        e.g. "c325b". The result is emitted as ``name: "<verbalized serial>"``.

        Args:
            cardinal: cardinal grammar; its ``single_digits_graph``, ``graph``
                and ``graph_hundred_component_at_least_one_none_zero_digit``
                are used to verbalize the numeric parts.
            ordinal: ordinal grammar; every input accepted by ``ordinal.graph``
                is excluded from the serial options.
            deterministic: forwarded to the base class. When False (and ``lm``
                is False) extra digit readings are added.
            lm: when True the extra non-deterministic digit readings are not
                added.

        Attributes set:
            graph: the optimized serial graph.
            fst: ``graph`` passed through ``convert_space`` and wrapped in
                ``name: "..."``.
        """
        super().__init__(name="integer",
                         kind="classify",
                         deterministic=deterministic)
        # Numeric parts: 6+ digits are read digit by digit, 1-5 digits are read
        # through the cardinal graph.
        num_graph = pynini.compose(NEMO_DIGIT**(6, ...),
                                   cardinal.single_digits_graph).optimize()
        num_graph |= pynini.compose(NEMO_DIGIT**(1, 5),
                                    cardinal.graph).optimize()
        # to handle numbers starting with zero
        num_graph |= pynini.compose(
            pynini.accep("0") + pynini.closure(NEMO_DIGIT),
            cardinal.single_digits_graph).optimize()
        # TODO: "#" doesn't work from the file
        symbols_graph = pynini.string_file(
            get_abs_path("data/whitelist/symbol.tsv")).optimize(
            ) | pynini.cross("#", "hash")
        num_graph |= symbols_graph

        if not self.deterministic and not lm:
            num_graph |= cardinal.single_digits_graph
            # also allow double digits to be pronounced as integer in serial number
            num_graph |= pynutil.add_weight(
                NEMO_DIGIT**2 @ cardinal.
                graph_hundred_component_at_least_one_none_zero_digit,
                weight=0.0001)

        # add space between letter and digit/symbol
        symbols = [
            x[0]
            for x in load_labels(get_abs_path("data/whitelist/symbol.tsv"))
        ]
        symbols = pynini.union(*symbols)
        digit_symbol = NEMO_DIGIT | symbols

        # Two rewrite passes: insert a space after a letter/symbol that is
        # followed by a digit/symbol, then after a digit/symbol that is
        # followed by a letter/symbol.
        graph_with_space = pynini.compose(
            pynini.cdrewrite(pynutil.insert(" "), NEMO_ALPHA | symbols,
                             digit_symbol, NEMO_SIGMA),
            pynini.cdrewrite(pynutil.insert(" "), digit_symbol,
                             NEMO_ALPHA | symbols, NEMO_SIGMA),
        )

        # serial graph with delimiter
        delimiter = pynini.accep("-") | pynini.accep("/") | pynini.accep(" ")
        alphas = pynini.closure(NEMO_ALPHA, 1)
        letter_num = alphas + delimiter + num_graph
        num_letter = pynini.closure(num_graph + delimiter, 1) + alphas
        next_alpha_or_num = pynini.closure(delimiter + (alphas | num_graph))
        # Also allow "<delimiter><number><letters>", keeping an existing space
        # between number and letters or inserting one when it is missing.
        next_alpha_or_num |= pynini.closure(
            delimiter + num_graph +
            plurals._priority_union(pynini.accep(" "), pynutil.insert(" "),
                                    NEMO_SIGMA).optimize() + alphas)

        serial_graph = letter_num + next_alpha_or_num
        serial_graph |= num_letter + next_alpha_or_num
        # numbers only with 2+ delimiters
        serial_graph |= (num_graph + delimiter + num_graph + delimiter +
                         num_graph + pynini.closure(delimiter + num_graph))
        # 2+ symbols
        serial_graph |= pynini.compose(NEMO_SIGMA + symbols + NEMO_SIGMA,
                                       num_graph + delimiter + num_graph)

        # exclude ordinal numbers from serial options
        serial_graph = pynini.compose(
            pynini.difference(NEMO_SIGMA,
                              pynini.project(ordinal.graph, "input")),
            serial_graph).optimize()

        serial_graph = pynutil.add_weight(serial_graph, 0.0001)
        # Any non-space run ending in "^2" / "^3" is read as squared / cubed.
        serial_graph |= (pynini.closure(NEMO_NOT_SPACE, 1) +
                         (pynini.cross("^2", " squared")
                          | pynini.cross("^3", " cubed")).optimize())

        # at least one serial graph with alpha numeric value and optional additional serial/num/alpha values
        serial_graph = (
            pynini.closure((serial_graph | num_graph | alphas) + delimiter) +
            serial_graph + pynini.closure(delimiter +
                                          (serial_graph | num_graph | alphas)))

        # Also accept inputs written without delimiters by first inserting the
        # spaces computed above.
        serial_graph |= pynini.compose(graph_with_space,
                                       serial_graph.optimize()).optimize()
        # The input must start with at least two non-space characters and
        # contain no spaces.
        serial_graph = pynini.compose(pynini.closure(NEMO_NOT_SPACE, 2),
                                      serial_graph).optimize()

        self.graph = serial_graph.optimize()
        graph = pynutil.insert("name: \"") + convert_space(
            self.graph).optimize() + pynutil.insert("\"")
        self.fst = graph.optimize()
Exemplo n.º 24
0
def _priority_union(q: pynini.Fst, r: pynini.Fst,
                    sigma: pynini.Fst) -> pynini.Fst:
    """
    Union of ``q`` and ``r`` in which ``q`` takes priority.

    ``r`` is applied only to strings of ``sigma`` that are not in the input
    projection of ``q``; every other string is handled by ``q`` alone.

    Args:
        q: the preferred transducer.
        r: the fallback transducer.
        sigma: the universe of input strings from which ``q``'s inputs are
            subtracted.

    Returns:
        ``q`` unioned with ``r`` restricted to inputs outside ``q``'s domain.
    """
    q_inputs = pynini.project(q, "input")
    outside_q = pynini.difference(sigma, q_inputs)
    fallback = pynini.compose(outside_q, r)
    return pynini.union(q, fallback)
Exemplo n.º 25
0
    def __init__(self, cardinal: GraphFst):
        """
        Build the telephone classifier, which turns spoken digit sequences into
        formatted numbers.

        The final graph accepts several layouts, each emitted as
        ``number_part: "..."``:
            * phone number: 3-3-4 digits joined by inserted "-", optionally
              preceded by ``country_code: "..."``;
            * credit card: four groups of 4 digits joined by inserted spaces;
            * SSN: 3-2-4 digits joined by inserted "-";
            * IP address: four digit groups joined by " dot " -> ".";
            * the output of ``get_serial_number`` with a small weight.

        Args:
            cardinal: cardinal grammar; its ``graph_no_exception`` is used for
                two-digit readings and it is passed to ``get_serial_number``.
        """
        super().__init__(name="telephone", kind="classify")
        # country code, number_part, extension
        # Digit -> spoken form: the inverted digit table plus "o" / "oh" /
        # "zero" for 0.
        digit_to_str = (
            pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize())
            | pynini.cross("0", pynini.union("o", "oh", "zero")).optimize()
        )

        str_to_digit = pynini.invert(digit_to_str)

        # One entry per digit 0-9 pairing "<word> <word>" with "double <word>".
        double_digit = pynini.union(
            *[
                pynini.cross(
                    pynini.project(str(i) @ digit_to_str, "output")
                    + pynini.accep(" ")
                    + pynini.project(str(i) @ digit_to_str, "output"),
                    pynutil.insert("double ") + pynini.project(str(i) @ digit_to_str, "output"),
                )
                for i in range(10)
            ]
        )
        # In-place inversion; presumably leaves "double <word>" on the input
        # side and "<word> <word>" on the output side (confirm against
        # pynini.cross semantics).
        double_digit.invert()

        # to handle cases like "one twenty three"
        two_digit_cardinal = pynini.compose(cardinal.graph_no_exception, NEMO_DIGIT ** 2)
        double_digit_to_digit = (
            pynini.compose(double_digit, str_to_digit + pynutil.delete(" ") + str_to_digit) | two_digit_cardinal
        )

        # One or more space-separated items, each a single digit word or a
        # two-digit reading; the two-digit reading gets a slightly negative
        # weight, each further item a slightly positive one.
        single_or_double_digit = (pynutil.add_weight(double_digit_to_digit, -0.0001) | str_to_digit).optimize()
        single_or_double_digit |= (
            single_or_double_digit
            + pynini.closure(pynutil.add_weight(pynutil.delete(" ") + single_or_double_digit, 0.0001))
        ).optimize()

        # Phone number body: exactly 10 digits formatted as 3-3-4.
        number_part = pynini.compose(
            single_or_double_digit,
            NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 4,
        ).optimize()
        number_part = pynutil.insert("number_part: \"") + number_part.optimize() + pynutil.insert("\"")

        # Spoken sequence that yields 2 or 3 digits.
        cardinal_option = pynini.compose(single_or_double_digit, NEMO_DIGIT ** (2, 3))

        # Country code: optional "plus " -> "+", then either 1-3 single digit
        # words or a 2-3 digit sequence.
        country_code = (
            pynutil.insert("country_code: \"")
            + pynini.closure(pynini.cross("plus ", "+"), 0, 1)
            + ((pynini.closure(str_to_digit + pynutil.delete(" "), 0, 2) + str_to_digit) | cardinal_option)
            + pynutil.insert("\"")
        )

        optional_country_code = pynini.closure(country_code + pynutil.delete(" ") + insert_space, 0, 1).optimize()
        graph = optional_country_code + number_part

        # credit card number
        space_four_digits = insert_space + NEMO_DIGIT ** 4
        credit_card_graph = pynini.compose(single_or_double_digit, NEMO_DIGIT ** 4 + space_four_digits ** 3).optimize()
        graph |= pynutil.insert("number_part: \"") + credit_card_graph.optimize() + pynutil.insert("\"")

        # SSN
        ssn_graph = pynini.compose(
            single_or_double_digit,
            NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 2 + pynutil.insert("-") + NEMO_DIGIT ** 4,
        ).optimize()
        graph |= pynutil.insert("number_part: \"") + ssn_graph.optimize() + pynutil.insert("\"")

        # ip
        # One IP group: a two-digit reading with an optional single digit
        # before or after it, 1-3 single digits, or a 2-3 digit sequence.
        digit_or_double = pynini.closure(str_to_digit + pynutil.delete(" "), 0, 1) + double_digit_to_digit
        digit_or_double |= double_digit_to_digit + pynini.closure(pynutil.delete(" ") + str_to_digit, 0, 1)
        digit_or_double |= str_to_digit + (pynutil.delete(" ") + str_to_digit) ** (0, 2)
        digit_or_double |= cardinal_option
        digit_or_double = digit_or_double.optimize()

        # Four groups separated by " dot ", which is rewritten to ".".
        ip_graph = digit_or_double + (pynini.cross(" dot ", ".") + digit_or_double) ** 3

        graph |= pynutil.insert("number_part: \"") + ip_graph.optimize() + pynutil.insert("\"")
        # Serial numbers from get_serial_number, with a small added weight.
        graph |= (
            pynutil.insert("number_part: \"")
            + pynutil.add_weight(get_serial_number(cardinal=cardinal), weight=0.0001)
            + pynutil.insert("\"")
        )

        final_graph = self.add_tokens(graph)
        self.fst = final_graph.optimize()
Exemplo n.º 26
0
    def __init__(self):
        """
        Build the cardinal classifier that maps French number words to digits.

        The grammar reads word-to-digit tables from ``data/numbers/*.tsv``,
        assembles zero-padded digit groups for the hundreds ("cent"/"cents"),
        "mille", "million(s)", "milliard(s)", "billion(s)", "billiard(s)",
        "trillion(s)" and "trilliard(s)" positions, strips leading zeros and
        finally applies ``rewrite`` to the result.

        Attributes set:
            graph_hundreds_component_at_least_one_none_zero_digit: the hundreds
                group restricted to outputs that contain a non-zero digit,
                passed through ``rewrite`` and optimized.
            graph_no_exception: the full words -> digits graph.
            numbers_up_to_thousand: that graph limited to 1-3 digit outputs.
            numbers_up_to_million: that graph limited to 1-6 digit outputs.
            graph: ``graph_no_exception`` without the inputs listed in the
                digit and zero tables.
            fst: the optimized final graph: an optional ``negative: "-"`` field
                followed by ``integer: "<digits>"``, passed through
                ``self.add_tokens``.
        """
        super().__init__(name="cardinal", kind="classify")
        # Word -> digit tables. NOTE(review): the output width of each table is
        # defined by the .tsv files, which are not visible here.
        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        graph_digit = pynini.string_file(
            get_abs_path("data/numbers/digit.tsv"))
        graph_teens = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
        graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        graph_ties_unique = pynini.string_file(
            get_abs_path("data/numbers/ties_unique.tsv"))

        # Tens components
        graph_tens_component = graph_ties + (
            (delete_hyphen + graph_digit) | pynutil.insert("0"))
        graph_tens_component = pynini.union(graph_tens_component, graph_teens,
                                            graph_ties_unique)

        # Tens slot that may also be a lone digit (padded with a leading "0")
        # or entirely absent ("00", with a small weight on the second zero).
        graph_tens_component_with_leading_zeros = pynini.union(
            graph_tens_component,
            (pynutil.insert("0") +
             (graph_digit | pynutil.insert("0", weight=0.01))))

        # Hundreds components
        graph_cent_singular = pynutil.delete("cent")  # Used in hundreds place
        graph_cent_plural = pynini.cross(
            "cents", "00"
        )  # Only used as terminus of hundred sequence. deux cents -> 200, deux cent un -> 201

        # Digit table with the inputs "un" / "une" removed: bare "cent" (not
        # "un cent") is what maps to a leading 1 below.
        graph_digit_no_one = pynini.project(pynini.union("un", "une"), 'input')
        graph_digit_no_one = (pynini.project(graph_digit, "input") -
                              graph_digit_no_one.arcsort()) @ graph_digit

        graph_hundreds_component_singular = (
            graph_digit_no_one + delete_hyphen + graph_cent_singular
        )  # Regular way: [1-9] * 100

        graph_hundreds_component_singular = pynini.union(
            graph_hundreds_component_singular, pynini.cross("cent", "1"))
        graph_hundreds_component_singular += delete_hyphen
        graph_hundreds_component_singular += graph_tens_component_with_leading_zeros

        graph_hundreds_component_plural = graph_digit_no_one + delete_hyphen + graph_cent_plural

        # Hundreds group: "<digit> cent <tens>", "<digit> cents", or a tens
        # slot with an inserted "0" in the hundreds position.
        graph_hundreds_component = pynini.union(
            graph_hundreds_component_singular,
            graph_hundreds_component_plural,
            pynutil.insert("0") + graph_tens_component_with_leading_zeros,
        )

        # Same group, but the output must contain at least one digit other
        # than "0". The local (un-rewritten) version is used for the groups
        # below; the attribute stores the rewritten, optimized version.
        graph_hundreds_component_at_least_one_none_zero_digit = graph_hundreds_component @ (
            pynini.closure(NEMO_DIGIT) +
            (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT))
        self.graph_hundreds_component_at_least_one_none_zero_digit = rewrite(
            graph_hundreds_component_at_least_one_none_zero_digit).optimize()

        # Graph thousands (we'll need this for cases of mille millions, mille milliards...)
        graph_tens_of_hundreds_component_singular = (
            graph_tens_component + delete_hyphen + graph_cent_singular
        )  # Tens of hundreds. e.g. 1900 = nineteen hundred/ 'dix neuf cents"
        graph_tens_of_hundreds_component_singular += delete_hyphen + graph_tens_component_with_leading_zeros
        graph_tens_of_hundreds_component_plural = graph_tens_component + delete_hyphen + graph_cent_plural
        graph_tens_of_hundred_component = (
            graph_tens_of_hundreds_component_plural
            | graph_tens_of_hundreds_component_singular)

        # Thousands group: "<group> mille", bare "mille" -> "001", or an
        # inserted "000" (weight 0.1) when the group is absent.
        graph_thousands = pynini.union(
            graph_hundreds_component_at_least_one_none_zero_digit +
            delete_hyphen + pynutil.delete("mille"),
            pynutil.insert("001") +
            pynutil.delete("mille"),  # because 'mille', not 'un mille'
            pynutil.insert("000", weight=0.1),
        )

        # All other large amounts: "<group> <unit word>", or an inserted "000"
        # (weight 0.1) when the group is absent.
        graph_millions = pynini.union(
            graph_hundreds_component_at_least_one_none_zero_digit +
            delete_hyphen +
            (pynutil.delete("million") | pynutil.delete("millions")),
            pynutil.insert("000", weight=0.1),
        )

        graph_milliards = pynini.union(  # French for English 'billion'
            graph_hundreds_component_at_least_one_none_zero_digit +
            delete_hyphen +
            (pynutil.delete("milliard") | pynutil.delete("milliards")),
            pynutil.insert("000", weight=0.1),
        )

        graph_billions = pynini.union(  # NOTE: this is English 'trillion.'
            graph_hundreds_component_at_least_one_none_zero_digit +
            delete_hyphen +
            (pynutil.delete("billions") | pynutil.delete("billion")),
            pynutil.insert("000", weight=0.1),
        )

        graph_billiards = pynini.union(
            graph_hundreds_component_at_least_one_none_zero_digit +
            delete_hyphen +
            (pynutil.delete("billiards") | pynutil.delete("billiard")),
            pynutil.insert("000", weight=0.1),
        )

        graph_trillions = pynini.union(  # One thousand English trillions.
            graph_hundreds_component_at_least_one_none_zero_digit +
            delete_hyphen +
            (pynutil.delete("trillions") | pynutil.delete("trillion")),
            pynutil.insert("000", weight=0.1),
        )

        graph_trilliards = pynini.union(
            graph_hundreds_component_at_least_one_none_zero_digit +
            delete_hyphen +
            (pynutil.delete("trilliards") | pynutil.delete("trilliard")),
            pynutil.insert("000", weight=0.1),
        )

        # Full number: the groups concatenated from largest to smallest, the
        # "tens of hundreds" form, or the zero table on its own.
        graph = pynini.union(
            graph_trilliards + delete_hyphen + graph_trillions +
            delete_hyphen + graph_billiards + delete_hyphen + graph_billions +
            delete_hyphen + graph_milliards + delete_hyphen + graph_millions +
            delete_hyphen + graph_thousands + delete_hyphen +
            graph_hundreds_component,
            graph_tens_of_hundred_component,
            graph_zero,
        )

        # Strip leading zeros from the padded digit string; a lone "0" passes
        # through unchanged.
        graph = graph @ pynini.union(
            pynutil.delete(pynini.closure("0")) + pynini.difference(
                NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0")

        # NOTE(review): `rewrite` is defined outside this block; its exact
        # effect on the graph is not visible here.
        graph = rewrite(graph)

        self.graph_no_exception = graph.optimize()

        # save self.numbers_up_to_thousand for use in DecimalFst
        digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3)
        numbers_up_to_thousand = pynini.compose(
            graph, digits_up_to_thousand).optimize()
        self.numbers_up_to_thousand = numbers_up_to_thousand

        # save self.numbers_up_to_million for use in DecimalFst
        digits_up_to_million = (NEMO_DIGIT
                                | (NEMO_DIGIT**2)
                                | (NEMO_DIGIT**3)
                                | (NEMO_DIGIT**4)
                                | (NEMO_DIGIT**5)
                                | (NEMO_DIGIT**6))
        numbers_up_to_million = pynini.compose(
            graph, digits_up_to_million).optimize()
        self.numbers_up_to_million = numbers_up_to_million

        # don't convert cardinals from zero to nine inclusive
        graph_exception = pynini.project(pynini.union(graph_digit, graph_zero),
                                         'input')

        # Restrict the input side to everything the graph accepts except the
        # words listed in the digit and zero tables.
        self.graph = (pynini.project(graph, "input") -
                      graph_exception.arcsort()) @ graph

        # Optional leading "moins" followed by a space becomes the
        # negative: "-" field.
        optional_minus_graph = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("moins", "\"-\"") +
            NEMO_SPACE, 0, 1)

        final_graph = optional_minus_graph + pynutil.insert(
            "integer: \"") + self.graph + pynutil.insert("\"")

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Exemplo n.º 27
0
Arquivo: date.py Projeto: NVIDIA/NeMo
    def __init__(self, cardinal: GraphFst, deterministic: bool, lm: bool = False):
        super().__init__(name="date", kind="classify", deterministic=deterministic)

        # january
        month_graph = pynini.string_file(get_abs_path("data/date/month_name.tsv")).optimize()
        # January, JANUARY
        month_graph |= pynini.compose(TO_LOWER + pynini.closure(NEMO_CHAR), month_graph) | pynini.compose(
            TO_LOWER ** (2, ...), month_graph
        )

        # jan
        month_abbr_graph = pynini.string_file(get_abs_path("data/date/month_abbr.tsv")).optimize()
        # jan, Jan, JAN
        month_abbr_graph = (
            month_abbr_graph
            | pynini.compose(TO_LOWER + pynini.closure(NEMO_LOWER, 1), month_abbr_graph).optimize()
            | pynini.compose(TO_LOWER ** (2, ...), month_abbr_graph).optimize()
        ) + pynini.closure(pynutil.delete("."), 0, 1)
        month_graph |= month_abbr_graph.optimize()

        month_numbers_labels = pynini.string_file(get_abs_path("data/date/month_number.tsv")).optimize()
        cardinal_graph = cardinal.graph_hundred_component_at_least_one_none_zero_digit

        year_graph = _get_year_graph(cardinal_graph=cardinal_graph, deterministic=deterministic)

        # three_digit_year = (NEMO_DIGIT @ cardinal_graph) + insert_space + (NEMO_DIGIT ** 2) @ cardinal_graph
        # year_graph |= three_digit_year

        month_graph = pynutil.insert("month: \"") + month_graph + pynutil.insert("\"")
        month_numbers_graph = pynutil.insert("month: \"") + month_numbers_labels + pynutil.insert("\"")

        endings = ["rd", "th", "st", "nd"]
        endings += [x.upper() for x in endings]
        endings = pynini.union(*endings)

        day_graph = (
            pynutil.insert("day: \"")
            + pynini.closure(pynutil.delete("the "), 0, 1)
            + (
                ((pynini.union("1", "2") + NEMO_DIGIT) | NEMO_DIGIT | (pynini.accep("3") + pynini.union("0", "1")))
                + pynini.closure(pynutil.delete(endings), 0, 1)
            )
            @ cardinal_graph
            + pynutil.insert("\"")
        )

        two_digit_year = _get_two_digit_year(
            cardinal_graph=cardinal_graph, single_digits_graph=cardinal.single_digits_graph
        )
        two_digit_year = pynutil.insert("year: \"") + two_digit_year + pynutil.insert("\"")

        # if lm:
        #     two_digit_year = pynini.compose(pynini.difference(NEMO_DIGIT, "0") + NEMO_DIGIT ** (3), two_digit_year)
        #     year_graph = pynini.compose(pynini.difference(NEMO_DIGIT, "0") + NEMO_DIGIT ** (2), year_graph)
        #     year_graph |= pynini.compose(pynini.difference(NEMO_DIGIT, "0") + NEMO_DIGIT ** (4, ...), year_graph)

        graph_year = pynutil.insert(" year: \"") + pynutil.delete(" ") + year_graph + pynutil.insert("\"")
        graph_year |= (
            pynutil.insert(" year: \"")
            + pynini.accep(",")
            + pynini.closure(pynini.accep(" "), 0, 1)
            + year_graph
            + pynutil.insert("\"")
        )
        optional_graph_year = pynini.closure(graph_year, 0, 1)

        year_graph = pynutil.insert("year: \"") + year_graph + pynutil.insert("\"")

        graph_mdy = month_graph + (
            (delete_extra_space + day_graph)
            | (pynini.accep(" ") + day_graph)
            | graph_year
            | (delete_extra_space + day_graph + graph_year)
        )

        graph_mdy |= (
            month_graph
            + pynini.cross("-", " ")
            + day_graph
            + pynini.closure(((pynini.cross("-", " ") + NEMO_SIGMA) @ graph_year), 0, 1)
        )

        for x in ["-", "/", "."]:
            delete_sep = pynutil.delete(x)
            graph_mdy |= (
                month_numbers_graph
                + delete_sep
                + insert_space
                + pynini.closure(pynutil.delete("0"), 0, 1)
                + day_graph
                + delete_sep
                + insert_space
                + (year_graph | two_digit_year)
            )

        graph_dmy = day_graph + delete_extra_space + month_graph + optional_graph_year
        day_ex_month = (NEMO_DIGIT ** 2 - pynini.project(month_numbers_graph, "input")) @ day_graph
        for x in ["-", "/", "."]:
            delete_sep = pynutil.delete(x)
            graph_dmy |= (
                day_ex_month
                + delete_sep
                + insert_space
                + month_numbers_graph
                + delete_sep
                + insert_space
                + (year_graph | two_digit_year)
            )

        graph_ymd = pynini.accep("")
        for x in ["-", "/", "."]:
            delete_sep = pynutil.delete(x)
            graph_ymd |= (
                (year_graph | two_digit_year)
                + delete_sep
                + insert_space
                + month_numbers_graph
                + delete_sep
                + insert_space
                + pynini.closure(pynutil.delete("0"), 0, 1)
                + day_graph
            )

        final_graph = graph_mdy | graph_dmy

        if not deterministic or lm:
            final_graph += pynini.closure(pynutil.insert(" preserve_order: true"), 0, 1)
            m_sep_d = (
                month_numbers_graph
                + pynutil.delete(pynini.union("-", "/"))
                + insert_space
                + pynini.closure(pynutil.delete("0"), 0, 1)
                + day_graph
            )
            final_graph |= m_sep_d
        else:
            final_graph += pynutil.insert(" preserve_order: true")

        final_graph |= graph_ymd | year_graph

        if not deterministic or lm:
            ymd_to_mdy_graph = None
            ymd_to_dmy_graph = None
            mdy_to_dmy_graph = None
            md_to_dm_graph = None

            for month in [x[0] for x in load_labels(get_abs_path("data/date/month_name.tsv"))]:
                for day in [x[0] for x in load_labels(get_abs_path("data/date/day.tsv"))]:
                    ymd_to_mdy_curr = (
                        pynutil.insert("month: \"" + month + "\" day: \"" + day + "\" ")
                        + pynini.accep('year:')
                        + NEMO_SIGMA
                        + pynutil.delete(" month: \"" + month + "\" day: \"" + day + "\"")
                    )

                    # YY-MM-DD -> MM-DD-YY
                    ymd_to_mdy_curr = pynini.compose(graph_ymd, ymd_to_mdy_curr)
                    ymd_to_mdy_graph = (
                        ymd_to_mdy_curr
                        if ymd_to_mdy_graph is None
                        else pynini.union(ymd_to_mdy_curr, ymd_to_mdy_graph)
                    )

                    ymd_to_dmy_curr = (
                        pynutil.insert("day: \"" + day + "\" month: \"" + month + "\" ")
                        + pynini.accep('year:')
                        + NEMO_SIGMA
                        + pynutil.delete(" month: \"" + month + "\" day: \"" + day + "\"")
                    )

                    # YY-MM-DD -> DD-MM-YY
                    ymd_to_dmy_curr = pynini.compose(graph_ymd, ymd_to_dmy_curr).optimize()
                    ymd_to_dmy_graph = (
                        ymd_to_dmy_curr
                        if ymd_to_dmy_graph is None
                        else pynini.union(ymd_to_dmy_curr, ymd_to_dmy_graph)
                    )

                    mdy_to_dmy_curr = (
                        pynutil.insert("day: \"" + day + "\" month: \"" + month + "\" ")
                        + pynutil.delete("month: \"" + month + "\" day: \"" + day + "\" ")
                        + pynini.accep('year:')
                        + NEMO_SIGMA
                    ).optimize()
                    # MM-DD-YY -> verbalize as MM-DD-YY (February fourth 1991) or DD-MM-YY (the fourth of February 1991)
                    mdy_to_dmy_curr = pynini.compose(graph_mdy, mdy_to_dmy_curr).optimize()
                    mdy_to_dmy_graph = (
                        mdy_to_dmy_curr
                        if mdy_to_dmy_graph is None
                        else pynini.union(mdy_to_dmy_curr, mdy_to_dmy_graph).optimize()
                    ).optimize()

                    md_to_dm_curr = pynutil.insert("day: \"" + day + "\" month: \"" + month + "\"") + pynutil.delete(
                        "month: \"" + month + "\" day: \"" + day + "\""
                    )
                    md_to_dm_curr = pynini.compose(m_sep_d, md_to_dm_curr).optimize()

                    md_to_dm_graph = (
                        md_to_dm_curr
                        if md_to_dm_graph is None
                        else pynini.union(md_to_dm_curr, md_to_dm_graph).optimize()
                    ).optimize()

            final_graph |= mdy_to_dmy_graph | md_to_dm_graph | ymd_to_mdy_graph | ymd_to_dmy_graph

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Exemplo n.º 28
0
  def __init__(self, *features: Feature) -> None:
    """Sets up an acceptor for the defined category.

    The acceptor will accept a sequence of valid values for each feature,
    where the ordering is given by the lexicographic order of the name of the
    features --- i.e. the order in which they are given to the constructor is
    irrelevant.

    If one has previously defined:

        case = Feature("case", "nom", "acc", "gen", "dat")
        gen = Feature("gen", "mas", "fem", "neu")
        num = Feature("num", "sg", "pl")

    Then

        noun = Category(case, gen, num)

    will allow any sequence in

        ([case=nom] | [case=acc] | [case=gen] | [case=dat]) +
        ([gen=mas] | [gen=fem] | [gen=neu]) +
        ([num=sg] | [num=pl])

    The feature_filler fills in missing feature values with either the default
    for the given feature if there is one, otherwise all possible values. So if
    we have

        case: nom, gen, acc, n/a
        num: sg, pl

    where "n/a" is the default feature (specified with the default keyword to
    the Feature), then

        [num=sg]

    will be filled to

        [case=n/a][num=sg]

    but

        [case=gen]

    will be filled to

        [case=gen]([num=sg]|[num=pl])

    Args:
      *features: one or more Features.

    Raises:
      Error: if no features are provided.
    """
    if not features:
      # The exception has to be raised; merely constructing it is a no-op and
      # would let an empty category fall through to the code below.
      raise Error("No features provided to Category object")
    # Canonical ordering: features are always arranged by name, regardless of
    # the order in which the caller passed them.
    self._features = sorted(features, key=operator.attrgetter("name"))
    self._acceptor = _concatstar(f.acceptor for f in self._features)
    self._feature_mapper = self._make_feature_mapper()
    # For each feature slot, either an explicit value is read from the input
    # or a value is inserted: the feature's default acceptor when it has one,
    # otherwise the feature's full acceptor (i.e. any of its values).
    transducers = []
    for f in self._features:
      default = f.default_acceptor if f.default_acceptor else f.acceptor
      transducers.append(pynutil.insert(default) | f.acceptor)
    self._feature_filler = _concatstar(transducers).optimize()
    self._feature_labels = pynini.project(self._feature_mapper, "input")
    # Closure over plain bytes plus the input side of the feature mapper.
    self._sigma_star = pynini.union(byte.BYTE,
                                    self._feature_labels).closure().optimize()
Exemplo n.º 29
0
    def __init__(self):
        """
        Assembles the "cardinal" classify grammar and stores it in ``self.fst``.

        Besides ``self.fst`` the constructor exposes three intermediate graphs:
            self.graph_hundred_component_at_least_one_none_zero_digit:
                the three-digit component restricted to outputs containing a non-zero digit
            self.graph_no_exception: the full number graph
            self.graph: the full number graph minus the inputs ``num_to_word(0..12)``

        NOTE(review): the .tsv files presumably map spelled-out number words to digits;
        their contents are not visible here.
        """
        super().__init__(name="cardinal", kind="classify")

        zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))

        # The word "hundred" is consumed without emitting anything.
        drop_hundred = pynini.cross("hundred", "")

        # Hundreds place: "<digit> hundred", or a padded "0" when absent.
        hundreds_place = pynini.union(digit + delete_space + drop_hundred, pynutil.insert("0"))
        # Tens and units: a teen, a padded "00", or ties/digit each padded with "0" when absent.
        tens_and_units = pynini.union(
            teen | pynutil.insert("00"),
            (ties | pynutil.insert("0")) + delete_space + (digit | pynutil.insert("0")),
        )
        hundred_component = hundreds_place + delete_space + tens_and_units

        # Keep only outputs that contain at least one non-zero digit.
        has_nonzero_digit = pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT)
        nonzero_hundred_component = hundred_component @ has_nonzero_digit
        self.graph_hundred_component_at_least_one_none_zero_digit = nonzero_hundred_component

        def magnitude_group(word):
            # Either "<non-zero component> <word>" or a weighted "000" placeholder for a missing group.
            return pynini.union(
                nonzero_hundred_component + delete_space + pynutil.delete(word),
                pynutil.insert("000", weight=0.1),
            )

        # Groups are concatenated from the largest magnitude down to the plain hundred component.
        magnitude_words = (
            "sextillion",
            "quintillion",
            "quadrillion",
            "trillion",
            "billion",
            "million",
            "thousand",
        )
        full_number = magnitude_group(magnitude_words[0])
        for word in magnitude_words[1:]:
            full_number = full_number + delete_space + magnitude_group(word)
        full_number = full_number + delete_space + hundred_component

        graph = pynini.union(full_number, zero)

        # Strip the leading zeros introduced by padding; a lone "0" passes through unchanged.
        strip_leading_zeros = pynini.union(
            pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0"
        )
        graph = graph @ strip_leading_zeros

        # Delete "and" between spaces and require the input to start with an alphabetic character.
        drop_and = pynini.cdrewrite(pynutil.delete("and"), NEMO_SPACE, NEMO_SPACE, NEMO_SIGMA)
        graph = (drop_and @ (NEMO_ALPHA + NEMO_SIGMA)) @ graph

        self.graph_no_exception = graph

        # Inputs equal to num_to_word(0) .. num_to_word(12) are removed from the accepted set.
        graph_exception = pynini.union(*[num_to_word(value) for value in range(13)])
        accepted_inputs = pynini.project(graph, "input") - graph_exception.arcsort()
        self.graph = accepted_inputs @ graph

        # Optional leading "minus" becomes the field negative: "-".
        minus_field = pynutil.insert("negative: ") + pynini.cross("minus", "\"-\"") + NEMO_SPACE
        optional_minus_graph = pynini.closure(minus_field, 0, 1)

        integer_field = pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")
        tagged = self.add_tokens(optional_minus_graph + integer_field)
        self.fst = tagged.optimize()
Exemplo n.º 30
0
 def _make_analyzer(self) -> None:
     """Builds ``self._analyzer`` from the stems-to-forms mapping.

     The output side of ``self._stems_to_forms`` is composed with
     ``self._deleter``; the result is then inverted and optimized in place.
     """
     forms = pynini.project(self._stems_to_forms, "output")
     analyzer = forms @ self._deleter
     # invert() and optimize() are chained in the original, i.e. each call
     # hands back the FST it was called on; here they run as separate steps.
     analyzer.invert()
     analyzer.optimize()
     self._analyzer = analyzer