Exemplo n.º 1
0
    def __init__(self):
        """Builds the date classifier for spoken "<day> de <month>" phrases.

        The day is a spelled-out number whose digit value is limited to 1-31
        (or the word "primero", mapped to 1); the month is looked up in
        ``data/months.tsv``. The tagged output has the form
        ``day: "<day>" month: "<month>" preserve_order: true`` and is wrapped
        with ``add_tokens`` before being stored, optimized, in ``self.fst``.
        """
        super().__init__(name="date", kind="classify")

        def load(relative_path):
            # Every lexicon is resolved relative to the package data directory.
            return pynini.string_file(get_abs_path(relative_path))

        units = load("data/numbers/digit.tsv")
        tens = load("data/numbers/ties.tsv")
        teens = load("data/numbers/teen.tsv")
        twenties = load("data/numbers/twenties.tsv")

        # Round tens receive a trailing "0"; compound forms drop the " y "
        # joiner and append the unit digit instead.
        round_tens = tens + pynutil.insert("0")
        tens_with_unit = tens + pynutil.delete(" y ") + units
        spoken_number = pynini.union(units, twenties, teens, round_tens,
                                     tens_with_unit)

        # Keep only the readings whose digit string is a valid day of month.
        valid_days = pynini.union(*[str(day) for day in range(1, 32)])
        # "primero" may also be used for the 1st day of the month.
        day_number = pynini.union(spoken_number @ valid_days,
                                  pynini.cross("primero", "1"))

        day_field = (pynutil.insert("day: \"") + day_number +
                     pynutil.insert("\""))
        month_field = (pynutil.insert("month: \"") +
                       load("data/months.tsv") + pynutil.insert("\""))

        day_then_month = (day_field + delete_space + pynutil.delete("de") +
                          delete_extra_space + month_field)
        tagged = day_then_month + pynutil.insert(" preserve_order: true")
        self.fst = self.add_tokens(tagged).optimize()
Exemplo n.º 2
0
def shift_cardinal_gender(fst: 'pynini.FstLike') -> 'pynini.FstLike':
    """
    Composes feminine gender agreement onto the output strings of a cardinal fst.

    Masculine forms of "uno" (apocopated forms included) are rendered as "una", and gendered
    hundreds (200, 300, 400...) are rendered in their feminine form ("doscientos" -> "doscientas").
    The shift is limited to the values below one thousand and to the multiples of one thousand;
    higher powers of ten ("millones", "billones") are masculine nouns, so numbers that quantify
    them are left untouched.
    e.g.
        "doscientos" -> "doscientas"
        "doscientos mil" -> "doscientas mil"
        "doscientos millones" -> "doscientos millones"
        "doscientos mil millones" -> "doscientos mil millones"
        "doscientos millones doscientos mil doscientos" -> "doscientos millones doscientas mil doscientas"

    Args:
        fst: Any fst. Composes conversion onto fst's output strings
    """
    optional_hundreds = pynini.closure(NEMO_SPACE + hundreds, 0, 1)
    optional_below_hundred = pynini.closure(NEMO_SPACE + one_to_one_hundred, 0, 1)

    # Right context for hundreds that count thousands: " mil"/" milésimo", optionally followed by
    # lower-order words, and then the end of the number (EOS, closing quote or decimal separator).
    thousand_word = pynini.accep("mil") | pynini.accep("milésimo")
    number_end = pynini.union(pynini.accep("[EOS]"), pynini.accep("\""), decimal_separator)
    thousands_context = NEMO_SPACE + thousand_word + optional_hundreds + optional_below_hundred + number_end

    # Right context for the trailing hundreds: an optional sub-hundred word, then EOS or a quote.
    string_end = pynini.union(pynini.accep("[EOS]"), pynini.accep("\""))
    trailing_context = optional_below_hundred + string_end

    hundreds_before_mil = pynini.cdrewrite(fem_hundreds, "", thousands_context, NEMO_SIGMA)
    hundreds_at_end = pynini.cdrewrite(fem_hundreds, "", trailing_context, NEMO_SIGMA)
    # Directly before a quote, EOS or decimal separator we know the number ends here.
    ones_at_end = pynini.cdrewrite(fem_ones, "", pynini.union("[EOS]", "\"", decimal_separator), NEMO_SIGMA)

    feminine_agreement = hundreds_before_mil @ hundreds_at_end @ ones_at_end
    return fst @ feminine_agreement
Exemplo n.º 3
0
    def __init__(self,
                 alphabet,
                 insert_cost=DEFAULT_INSERT_COST,
                 delete_cost=DEFAULT_DELETE_COST,
                 substitute_cost=DEFAULT_SUBSTITUTE_COST):
        """Builds the left and right edit factors.

        Args:
          alphabet: edit alphabet (an iterable of strings).
          insert_cost: the cost for the insertion operation.
          delete_cost: the cost for the deletion operation.
          substitute_cost: the cost for the substitution operation.
        """
        insert_sym = "[{}]".format(self.INSERT)
        delete_sym = "[{}]".format(self.DELETE)
        substitute_sym = "[{}]".format(self.SUBSTITUTE)
        # Left factor. Each edit cost is halved here because the other half is
        # paid when the matching arc of the right factor is traversed.
        match = union(*alphabet).optimize(True)
        left_insert = transducer(
            "", insert_sym, weight=insert_cost / 2).optimize(True)
        left_delete = transducer(
            match, delete_sym, weight=delete_cost / 2).optimize(True)
        left_substitute = transducer(
            match, substitute_sym, weight=substitute_cost / 2).optimize(True)
        left_ops = union(match, left_insert, left_delete,
                         left_substitute).optimize(True)
        # Right factor: the inverse of the left factor (input and output
        # labels swapped), with the insert and delete labels exchanged on
        # what has become the input side.
        right_ops = invert(left_ops)
        table = right_ops.input_symbols()
        insert_label = table.find(self.INSERT)
        delete_label = table.find(self.DELETE)
        swap = ((insert_label, delete_label), (delete_label, insert_label))
        right_ops.relabel_pairs(ipairs=swap)
        # Any number of edit operations may be applied on either side.
        self._e_i = left_ops.closure().optimize(True)
        self._e_o = right_ops.closure().optimize(True)
Exemplo n.º 4
0
    def __construct_r21(self):
        '''
        Low to up

        Builds a rule that accepts any sequence of alphabet symbols and, in
        addition, deletes one or more "<^UC>" markers that are followed either
        by a "<NoHy>"/"<NoDef>" marker (kept unchanged) or by one application
        of ``self.__syms.to_upper`` (presumably the lower-to-upper character
        mapping; it is defined outside this method).

        Returns the optimized, label-pushed rule FST.
        '''
        with pynini.default_token_type(self.__syms.alphabet):

            alphabet = pynini.union(
                self.__syms.characters,
                pynini.string_map(["<NoHy>", "<NoDef>"]).project("input"))

            # NOTE: a leftover debugging call that dumped `to_upper` to
            # "to_upper.dot" in the current working directory used to live
            # here; building the rule must not write files as a side effect.

            # Construction in SFST involves negation (which is expensive).
            # It looks like we can do better:
            uppercase_target = pynini.union(
                pynini.string_map(["<NoHy>", "<NoDef>"]).project("input"),
                self.__syms.to_upper)
            # One or more "<^UC>" markers are consumed before the target.
            marked = pynini.concat(
                pynini.cross("<^UC>", "").closure(1), uppercase_target)
            rule = pynini.union(alphabet.closure(), marked).closure()
            return pynini.push(rule, push_labels=True).optimize()
0
    def __construct_r20(self):
        '''
        Up to low

        Accepts any sequence of alphabet symbols and, in addition, deletes one
        or more "<CB>" markers that are followed either by one of the markers
        "<^UC>", "<NoHy>", "<NoDef>" (kept unchanged) or by one application of
        ``self.__syms.to_lower``.
        '''
        with pynini.default_token_type(self.__syms.alphabet):
            marker_names = ["<^UC>", "<NoHy>", "<NoDef>"]

            alphabet = pynini.union(
                self.__syms.characters,
                pynini.string_map(marker_names).project("input"))

            # SFST uses a rewrite rule here
            after_boundary = pynini.union(
                pynini.string_map(marker_names).project("input"),
                self.__syms.to_lower)
            boundary_case = pynini.concat(
                pynini.cross("<CB>", "").closure(1), after_boundary)
            rule = pynini.union(alphabet.closure(), boundary_case).closure()
            return pynini.push(rule, push_labels=True).optimize()
Exemplo n.º 6
0
    def __init__(self, cardinal: GraphFst):
        """Builds the ordinal classifier on top of a cardinal grammar.

        The ordinal ending of the input is rewritten into the matching
        cardinal ending (via the ordinal digit/teen lexicons, "tieth" -> "ty",
        or by dropping "th"); the result is then composed with the cardinal
        graph and tagged as ``integer: "<value>"``.

        Args:
            cardinal: cardinal GraphFst exposing ``graph_no_exception``.
        """
        super().__init__(name="ordinal", kind="classify")

        cardinal_graph = cardinal.graph_no_exception
        ordinal_digits = pynini.string_file(get_abs_path("data/ordinals/digit.tsv"))
        ordinal_teens = pynini.string_file(get_abs_path("data/ordinals/teen.tsv"))

        # Anything may precede the ending; only the ending itself is rewritten.
        prefix = pynini.closure(NEMO_CHAR)
        ending = pynini.union(
            ordinal_digits, ordinal_teens, pynini.cross("tieth", "ty"), pynini.cross("th", "")
        )

        self.graph = (prefix + ending) @ cardinal_graph
        tagged = pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")
        self.fst = self.add_tokens(tagged).optimize()
Exemplo n.º 7
0
def build_stem_ids(min_id: int, max_id: int) -> pynini.Fst:
    """Builds the set of stem IDs.

    Every ID is rendered as the string __n__, with n running over the
    half-open range [min_id, max_id).

    Args:
      min_id: minimum stem ID (inclusive).
      max_id: maximum stem ID (exclusive).

    Returns:
      Optimized FST accepting the strings __n__ for n in [min_id, max_id).
    """
    stem_id_strings = map("__{}__".format, range(min_id, max_id))
    return pynini.union(*stem_id_strings).optimize()
Exemplo n.º 8
0
    def __init__(self, syms, sublexica, deko_filter, inflection, phon):
        '''
        Builds the default stems from the given morphology components.

        A partial morphology (prefixation, compounding, inflection, infix and
        upper/lower-case filtering, phonology) is assembled from the
        arguments, and three default stem sets are derived from it and stored
        on the instance.

        Args:
          syms: symbol inventory; ``alphabet``, ``characters`` and
            ``stem_types`` are read from it.
          sublexica: provides ``verbal_pref_stems``, ``base_stems`` and
            ``nodef_to_null``.
          deko_filter: provides ``pref_filter``, ``compound_filter``,
            ``infix_filter`` and ``uplow``.
          inflection: provides ``inflection`` and ``inflection_filter``.
          phon: provides the phonology FST ``phon``.
        '''

        #
        # store alphabet
        self.__syms = syms

        #
        # run parts of morphology building (cf. timur_fst)
        tmp = (sublexica.verbal_pref_stems + sublexica.base_stems
               ) * sublexica.nodef_to_null * deko_filter.pref_filter
        tmp = (sublexica.base_stems | tmp) * deko_filter.compound_filter

        # ANY TODO: Move to symbols!
        alphabet = pynini.union(
            syms.characters, syms.stem_types,
            pynini.string_map([
                "<FB>", "<SS>", "<n>", "<~n>", "<e>", "<d>", "<Ge-Nom>",
                "<UL>", "<NoHy>", "<NoDef>", "<ge>", "<Ge>", "<no-ge>", "<CB>"
            ],
                              input_token_type=syms.alphabet,
                              output_token_type=syms.alphabet).project()
        ).closure().optimize()

        # Attach the inflection and filter it, then apply the infix and
        # upper/lower-case filters.
        tmp = (tmp + inflection.inflection) * (
            alphabet + inflection.inflection_filter
        ) * deko_filter.infix_filter * deko_filter.uplow

        # Surround the word with inserted "<WB>" markers before running the
        # phonology over it.
        tmp = pynini.compose(
            pynini.concat(
                pynini.transducer("",
                                  "<WB>",
                                  output_token_type=self.__syms.alphabet),
                tmp,
                pynini.transducer("",
                                  "<WB>",
                                  output_token_type=self.__syms.alphabet),
            ), phon.phon).optimize()

        #
        # default stems

        # create a default composition stem for nouns
        self.__compound_stems_nn = self.__construct_compound_stems_nn(tmp)

        # create a deriv stem for Ge nominalization (Gelerne)
        self.__ge_nom_stems_v = self.__construct_ge_nom_stems_v(tmp)

        # create an adjective base stem from participles
        self.__participle_adj = self.__construct_participle_adj(tmp, sublexica)
        # NOTE: a leftover debugging call that dumped the participle FST to
        # "participle_adj.dot" in the current working directory was removed;
        # constructing the object must not write files as a side effect.
Exemplo n.º 9
0
    def __construct_r19(self):
        '''
        Eliminate markers

        Symbols of the alphabet (characters plus "<CB>", "<^UC>", "<NoHy>",
        "<NoDef>") pass through unchanged, while the remaining markers listed
        below are mapped to the empty string. The rule applies to any number
        of symbols.
        '''
        symbol_table = self.__syms.alphabet

        kept_markers = pynini.string_map(
            ["<CB>", "<^UC>", "<NoHy>", "<NoDef>"],
            input_token_type=symbol_table,
            output_token_type=symbol_table).project()
        alphabet = pynini.union(self.__syms.characters, kept_markers)

        removed_markers = pynini.string_map(
            [
                "<DEL-S>", "<SS>", "<FB>", "<^Gen>", "<^Del>", "<^pl>",
                "<^Ax>", "<WB>"
            ],
            input_token_type=symbol_table,
            output_token_type=symbol_table).project()
        delete_marker = pynini.transducer(removed_markers, "")

        return pynini.union(alphabet, delete_marker).closure().optimize()
Exemplo n.º 10
0
  def __construct_imperative_filter(self):
    '''
    Imperatives have no separable prefixes

    The filter is the union of two readings: (1) any sequence of alphabet
    symbols in which stem-type symbols are rewritten to "<CB>", and (2) a
    "<Base_Stems>" symbol rewritten to "<CB>", followed by alphabet symbols
    that contain one "<^imp>" marker, which is deleted.
    '''
    with pynini.default_token_type(self.__syms.alphabet):
      marker_names = [
          "<n>", "<~n>", "<e>", "<d>", "<NoHy>", "<NoDef>", "<VADJ>",
          "<CB>", "<FB>", "<UL>", "<SS>", "<DEL-S>", "<Low#>", "<Up#>",
          "<Fix#>", "<^UC>", "<^Ax>", "<^pl>", "<^Gen>", "<^Del>",
      ]
      markers = pynini.string_map(marker_names).project("input")
      alphabet = pynini.union(self.__syms.characters, markers).optimize()

      stem_type_to_boundary = pynini.cross(self.__syms.stem_types, "<CB>")
      c2 = pynini.union(alphabet, stem_type_to_boundary).closure().optimize()

      imperative = (
          pynini.cross("<Base_Stems>", "<CB>")
          + alphabet.closure()
          + pynini.cross("<^imp>", "")
          + alphabet.closure()
          )
      return pynini.union(c2, imperative).optimize()
Exemplo n.º 11
0
    def __init__(self):
        """Builds the classifier for spoken e-mail addresses and URLs.

        Two readings are accepted:
          * ``<username> at <server> dot <domain>``, tagged as
            ``username: "..." domain: "..."``;
          * an optional spelled "http(s) colon slash slash" prefix, "www"
            (spelled or not), "dot" and further components, tagged as
            ``protocol: "..."``.
        Single spaces between the spoken components are deleted.
        """
        super().__init__(name="electronic", kind="classify")

        drop_space = pynutil.delete(" ")
        digit_words = pynini.string_file(
            get_abs_path("data/numbers/digit.tsv"))
        zero_words = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        alpha_num = NEMO_ALPHA | digit_words | zero_words

        # The symbols file is inverted before use.
        symbols = pynini.string_file(
            get_abs_path("data/electronic/symbols.tsv")).invert()

        accepted_username = alpha_num | symbols
        process_dot = pynini.cross("dot", ".")

        # ----- e-mail address -----
        username = (pynutil.insert("username: \"") + alpha_num +
                    pynini.closure(drop_space + accepted_username) +
                    pynutil.insert("\""))
        single_alphanum = pynini.closure(alpha_num + drop_space) + alpha_num
        server = single_alphanum | pynini.string_file(
            get_abs_path("data/electronic/server_name.tsv"))
        domain = single_alphanum | pynini.string_file(
            get_abs_path("data/electronic/domain.tsv"))
        domain_graph = (pynutil.insert("domain: \"") + server + drop_space +
                        process_dot + drop_space + domain +
                        pynutil.insert("\""))
        email = (username + drop_space + pynutil.delete("at") + insert_space +
                 drop_space + domain_graph)

        # ----- url -----
        protocol_end = pynini.cross(pynini.union("w w w", "www"), "www")
        spelled_scheme = (pynini.cross("h t t p", "http")
                          | pynini.cross("h t t p s", "https"))
        protocol_start = spelled_scheme + pynini.cross(" colon slash slash ",
                                                       "://")
        # Trailing part such as ".com": a symbol followed by either a known
        # domain or a run of user-name symbols.
        username_run = (pynini.closure(accepted_username + drop_space) +
                        accepted_username)
        ending = drop_space + symbols + drop_space + (domain | username_run)

        url_body = (pynini.closure(protocol_start, 0, 1) + protocol_end +
                    drop_space + process_dot +
                    pynini.closure(drop_space + accepted_username, 1) +
                    pynini.closure(ending, 1))
        url = (pynutil.insert("protocol: \"") + url_body +
               pynutil.insert("\""))

        self.fst = self.add_tokens(email | url).optimize()
Exemplo n.º 12
0
    def __construct_imperative_filter(self):
        '''
        Imperatives have no separable prefixes

        The filter is the union of two readings: (1) any sequence of alphabet
        symbols in which stem-type symbols are rewritten to "<CB>", and (2) a
        "<Base_Stems>" symbol rewritten to "<CB>", followed by alphabet
        symbols that contain one "<^imp>" marker, which is deleted.
        '''
        symbol_table = self.__syms.alphabet

        markers = pynini.string_map(
            [
                "<n>", "<~n>", "<e>", "<d>", "<NoHy>", "<NoDef>", "<VADJ>",
                "<CB>", "<FB>", "<UL>", "<SS>", "<DEL-S>", "<Low#>",
                "<Up#>", "<Fix#>", "<^UC>", "<^Ax>", "<^pl>", "<^Gen>",
                "<^Del>"
            ],
            input_token_type=symbol_table,
            output_token_type=symbol_table).project()
        alphabet = pynini.union(self.__syms.characters, markers).optimize()

        stem_type_to_boundary = pynini.transducer(
            self.__syms.stem_types,
            "<CB>",
            input_token_type=symbol_table,
            output_token_type=symbol_table)
        c2 = pynini.union(alphabet, stem_type_to_boundary).closure().optimize()

        base_stem_to_boundary = pynini.transducer(
            "<Base_Stems>",
            "<CB>",
            input_token_type=symbol_table,
            output_token_type=symbol_table)
        drop_imperative = pynini.transducer(
            "<^imp>", "", input_token_type=symbol_table)
        imperative = pynini.concat(base_stem_to_boundary, alphabet.closure(),
                                   drop_imperative, alphabet.closure())

        return pynini.union(c2, imperative).optimize()
Exemplo n.º 13
0
def _get_year_graph(deterministic: bool = True):
    """
    Transducer for years written with four digits whose first digit is 1 or 2
    (1000 - 2999), optionally followed by "s" or " s" (the space is dropped).

    The verbalization itself is delegated to ``get_hundreds_graph``, e.g.
    1290 is presumably read as "twelve ninety"; per the original note,
    2000 - 2009 will be verbalized as two thousand.

    Args:
        deterministic: forwarded unchanged to ``get_hundreds_graph``.
    """
    hundreds_graph = get_hundreds_graph(deterministic)
    four_digits = pynini.union("1", "2") + NEMO_DIGIT + NEMO_DIGIT + NEMO_DIGIT
    optional_plural = pynini.closure(pynini.cross(" s", "s") | "s", 0, 1)
    return (four_digits + optional_plural) @ hundreds_graph
Exemplo n.º 14
0
    def __init__(self, cardinal: GraphFst, deterministic=False):
        """Builds the ordinal classifier on top of a cardinal grammar.

        The input is a run of digits and "." characters followed by an
        ordinal marker, which is deleted: either one of the endings "ter",
        "tes", "tem", "te", "ten" (slightly penalized) or a ".". The remaining
        number is passed through the cardinal graph and tagged as
        ``integer: "<value>"``.

        Args:
            cardinal: cardinal GraphFst exposing ``graph``.
            deterministic: forwarded to the base class constructor.
        """
        super().__init__(name="ordinal",
                         kind="classify",
                         deterministic=deterministic)

        cardinal_graph = cardinal.graph
        endings = ["ter", "tes", "tem", "te", "ten"]

        number_chars = pynini.closure(NEMO_DIGIT | pynini.accep("."))
        weighted_ending = pynutil.add_weight(pynini.union(*endings),
                                             weight=0.0001)
        ordinal_marker = pynutil.delete(weighted_ending | pynini.accep("."))

        self.graph = ((number_chars + ordinal_marker)
                      @ cardinal_graph).optimize()
        tagged = (pynutil.insert("integer: \"") + self.graph +
                  pynutil.insert("\""))
        self.fst = self.add_tokens(tagged).optimize()
Exemplo n.º 15
0
 def setUpClass(cls):
     """Builds a four-slot verbal paradigm shared by the tests.

     The slots are the bare root, a "+al" dubitative, a "+inay" gerundial
     built on a CVCC template and a "+ʔaa" durative built on a CVCVVC
     template; the paradigm is stored in ``cls.paradigm``.
     """
     super().setUpClass()
     # Not clear "aspect" is exactly the right concept.
     aspect = features.Feature("aspect", "root", "dubitative", "gerundial",
                               "durative")
     verb = features.Category(aspect)
     root = features.FeatureVector(verb, "aspect=root")
     stem = paradigms.make_byte_star_except_boundary()
     vowels = ("a", "i", "o", "u")
     any_vowel = pynini.union(*vowels)
     # Consonants keep a short name for space reasons.
     c = pynini.union("c", "m", "h", "l", "y", "k", "ʔ", "d", "n", "w", "t")
     # First template: apply Procrustean transformation to CVCC^?.
     cvcc = (c + any_vowel + pynutil.delete(any_vowel).ques + c +
             pynutil.delete(any_vowel).star + c.ques).optimize()
     # Second template: apply Procrustean transformation to CVCVVC^?. The
     # vowel has to be copied, which is most easily achieved by building one
     # branch per vowel and taking their union.
     cvcvvc = pynini.Fst()
     for vowel in vowels:
         branch = (c + vowel + pynutil.delete(vowel).ques + c +
                   pynutil.delete(vowel).star +
                   pynutil.insert(vowel + vowel) + c.ques)
         cvcvvc.union(branch)
     cvcvvc.optimize()
     dubitative = features.FeatureVector(verb, "aspect=dubitative")
     gerundial = features.FeatureVector(verb, "aspect=gerundial")
     durative = features.FeatureVector(verb, "aspect=durative")
     slots = [
         (stem, root),
         (paradigms.suffix("+al", stem), dubitative),
         (paradigms.suffix("+inay", stem @ cvcc), gerundial),
         (paradigms.suffix("+ʔaa", stem @ cvcvvc), durative),
     ]
     cls.paradigm = paradigms.Paradigm(
         category=verb,
         slots=slots,
         lemma_feature_vector=root,
         stems=["caw", "cuum", "hoyoo", "diiyl", "ʔilk", "hiwiit"])
Exemplo n.º 16
0
    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        """Builds the ordinal classifier on top of a cardinal grammar.

        The input is a run of digits and "," characters followed by one of
        the endings "rd", "th", "st", "nd" (lower or upper case), which is
        deleted. The remaining number is passed through the cardinal graph
        and tagged as ``integer: "<value>"``.

        Args:
            cardinal: cardinal GraphFst exposing ``graph``.
            deterministic: forwarded to the base class constructor.
        """
        super().__init__(name="ordinal",
                         kind="classify",
                         deterministic=deterministic)

        cardinal_graph = cardinal.graph
        lower_endings = ["rd", "th", "st", "nd"]
        endings = lower_endings + [ending.upper() for ending in lower_endings]

        number_chars = pynini.closure(NEMO_DIGIT | pynini.accep(","))
        drop_ending = pynutil.delete(pynini.union(*endings))

        self.graph = ((number_chars + drop_ending) @ cardinal_graph).optimize()
        tagged = (pynutil.insert("integer: \"") + self.graph +
                  pynutil.insert("\""))
        self.fst = self.add_tokens(tagged).optimize()
Exemplo n.º 17
0
    def __init__(self,
                 alphabet: Iterable[str],
                 insert_cost: float = DEFAULT_INSERT_COST,
                 delete_cost: float = DEFAULT_DELETE_COST,
                 substitute_cost: float = DEFAULT_SUBSTITUTE_COST):
        """Builds the left and right edit factors.

        Args:
          alphabet: edit alphabet (an iterable of strings).
          insert_cost: the cost for the insertion operation.
          delete_cost: the cost for the deletion operation.
          substitute_cost: the cost for the substitution operation.
        """
        # Left factor. Each edit cost is halved here because the other half is
        # paid when the matching arc of the right factor is traversed.
        match = pynini.union(*alphabet).optimize()
        left_insert = pynutil.insert(f"[{self.INSERT}]",
                                     weight=insert_cost / 2)
        delete_marker = pynini.accep(f"[{self.DELETE}]",
                                     weight=delete_cost / 2)
        left_delete = pynini.cross(match, delete_marker)
        substitute_marker = pynini.accep(f"[{self.SUBSTITUTE}]",
                                         weight=substitute_cost / 2)
        left_substitute = pynini.cross(match, substitute_marker)
        left_ops = pynini.union(match, left_insert, left_delete,
                                left_substitute).optimize()
        # Right factor: the inverse of the left factor (input and output
        # labels swapped), with the insert and delete labels exchanged on
        # what has become the input side.
        right_ops = pynini.invert(left_ops)
        generated = pynini.generated_symbols()
        insert_label = generated.find(self.INSERT)
        delete_label = generated.find(self.DELETE)
        right_ops.relabel_pairs(ipairs=[(insert_label, delete_label),
                                        (delete_label, insert_label)])
        # Any number of edit operations may be applied on either side.
        self._e_i = left_ops.closure().optimize()
        self._e_o = right_ops.closure().optimize()
Exemplo n.º 18
0
    def token_lattice(self, token: str) -> pynini.Fst:
        """Constructs a "link" of the lattice for a given token.

        The link is the optimized union of the expansions proposed for the
        token by the deduplicator, the deabbreviator, the regexps and the
        lexicon, in that order.

        Args:
          token: An input token.

        Returns:
          An FST "link".
        """
        expanders = (
            self._deduplicator,
            self._deabbreviator,
            self._regexps,
            self._lexicon,
        )
        expansions = [expander.expand(token) for expander in expanders]
        return pynini.union(*expansions).optimize()
Exemplo n.º 19
0
 def _get_thousands_graph():
     """
     Transducer for a thousands expression: a digit, the word "nghìn" or
     "ngàn" (deleted), a hundreds component, and a trailing two-digit part.

     When the hundreds component ("<digit or zero> trăm") is not spoken, a
     "0" is inserted in its place.
     """
     # NOTE(review): `graph_digits` is not defined in this function; it
     # presumably comes from the enclosing scope - confirm.
     graph_ties = _get_ties_graph()
     spoken_hundreds = (graph_digit | graph_zero) + delete_space + pynutil.delete("trăm")
     graph_hundred_component = spoken_hundreds | pynutil.insert("0")
     thousand_word = pynutil.delete(pynini.union("nghìn", "ngàn"))
     last_two_digits = graph_teen | graph_ties | graph_digits
     return (
         graph_digit
         + delete_space
         + thousand_word
         + delete_space
         + graph_hundred_component
         + delete_space
         + last_two_digits
     )
Exemplo n.º 20
0
def strip_cardinal_apocope(fst: 'pynini.FstLike') -> 'pynini.FstLike':
    """
    Undoes apocope at the end of a cardinal string, restoring the full form "uno".
    e.g.
        "un" -> "uno"
        "veintiún" -> "veintiuno"

    The rewrite only fires directly before the end of the string or a closing quote, so only the
    final occurrence can be affected.

    Args:
        fst: Any fst. Composes conversion onto fst's output strings
    """
    # Cardinals keep apocope before large values (e.g. "millón"), hence the restriction to the
    # last instance of one.
    restore_uno = pynini.union(pynini.cross("un", "uno"), pynini.cross("ún", "uno"))
    at_string_end = pynini.union("[EOS]", "\"")
    rule = pynini.cdrewrite(restore_uno, "", at_string_end, NEMO_SIGMA)
    return fst @ rule
Exemplo n.º 21
0
def add_cardinal_apocope_fem(fst: 'pynini.FstLike') -> 'pynini.FstLike':
    """
    Applies apocope to feminine cardinal strings, i.e. shortens a final "una". In formal speech this
    happens only when "una" precedes a stressed "a" sound, which cannot be predicted from the text
    string, so the conversion is meant for non-deterministic cases.
    e.g.
        "una" -> "un"
        "veintiuna" -> "veintiún"

    Args:
        fst: Any fst. Composes conversion onto fst's output strings
    """
    # The stress trigger follows the cardinal and only affects the sound right before it, so the
    # rewrite is restricted to the position directly before the end of string or a closing quote.
    shorten_una = pynini.union(pynini.cross("una", "un"), pynini.cross("veintiuna", "veintiún"))
    at_string_end = pynini.union("[EOS]", "\"")
    rule = pynini.cdrewrite(shorten_una, "", at_string_end, NEMO_SIGMA)
    return fst @ rule
Exemplo n.º 22
0
 def __suff_stems_filter(self, features):
     '''
     Return a union over filters for each feature given

     Each per-feature filter deletes the feature symbol, accepts
     "<Suff_Stems>" and deletes the same feature symbol once more.
     '''
     symbol_table = self.__syms.alphabet
     # Start from an empty FST that already carries the symbol tables.
     result = pynini.Fst()
     result.set_input_symbols(symbol_table)
     result.set_output_symbols(symbol_table)
     suff_stems = pynini.acceptor("<Suff_Stems>", token_type=symbol_table)
     for feature in features:
         drop_feature = pynini.transducer(feature,
                                          "",
                                          input_token_type=symbol_table)
         flanked = pynini.concat(drop_feature, suff_stems, drop_feature)
         result = pynini.union(result, flanked)
     return result.optimize()
Exemplo n.º 23
0
def _get_range_graph():
    """
    Transducer for decades (1**0s, 2**0s), centuries (2*00s, 1*00s), millennia (2000s).

    Whatever the spoken form, the output is restricted to the shape
    "<1 or 2><digit><digit><digit>s".
    """
    graph_ties = _get_ties_graph()
    leading_pair = graph_ties | graph_teen

    # "<tens/teen> hundreds" -> "..00s"
    centuries = leading_pair + delete_space + pynini.cross("hundreds", "00s")
    # "two thousands" -> "2000s"
    millennia = pynini.cross("two", "2") + delete_space + pynini.cross("thousands", "000s")
    # Plural decade word: undo the plural ("ies" -> "y", or drop "s"), read the singular as a
    # number, then put the "s" back after the digits.
    singular_word = pynini.closure(NEMO_ALPHA, 1) + (pynini.cross("ies", "y") | pynutil.delete("s"))
    decade_digits = singular_word @ (graph_ties | pynini.cross("ten", "10"))
    decades = leading_pair + delete_space + decade_digits + pynutil.insert("s")

    spoken = centuries | millennia | decades
    allowed_shape = pynini.union("1", "2") + NEMO_DIGIT + NEMO_DIGIT + NEMO_DIGIT + "s"
    return spoken @ allowed_shape
Exemplo n.º 24
0
 def __suff_stems_filter(self, features):
   '''
   Return a union over filters for each feature given

   Each per-feature filter deletes the feature symbol, accepts
   "<Suff_Stems>" and deletes the same feature symbol once more.
   '''
   symbol_table = self.__syms.alphabet
   with pynini.default_token_type(symbol_table):
     # Start from an empty FST that already carries the symbol tables.
     result = pynini.Fst()
     result.set_input_symbols(symbol_table)
     result.set_output_symbols(symbol_table)
     suff_stems = pynini.accep("<Suff_Stems>")
     for feature in features:
       drop_feature = pynini.cross(feature, "")
       flanked = drop_feature + suff_stems + drop_feature
       result = pynini.union(result, flanked)
     return result.optimize()
Exemplo n.º 25
0
  def _make_feature_mapper(self) -> pynini.Fst:
    r"""Builds a map from feature symbols to human-readable strings.

    Returns:
      A transducer that rewrites every internal symbol such as "[case=nom]"
      into its escaped, printable spelling ("\[case=nom\]"), for all
      feature-value combinations, applied any number of times.
    """
    symbols = [
        f"[{feature.name}={value}]"
        for feature in self._features
        for value in feature.values
    ]
    # The unescaped text is the symbol itself; the escaped text is its
    # literal spelling.
    mappings = [
        pynini.cross(symbol, pynini.escape(symbol)) for symbol in symbols
    ]
    return pynini.union(*mappings).closure().optimize()
Exemplo n.º 26
0
    def __init__(self):
        """Builds the ordinal verbalizer.

        The input ``integer: "<value>" morphosyntactic_features: "<f>"`` is
        reduced to ``<value>`` followed by the ordinal mark that corresponds
        to the feature: "o" -> ".º", "a" -> ".ª", "er" -> ".ᵉʳ".
        """
        super().__init__(name="ordinal", kind="verbalize")

        # Strip the field name and the quotes, keep the quoted value.
        value = (pynutil.delete("integer:") + delete_space +
                 pynutil.delete("\"") + pynini.closure(NEMO_NOT_QUOTE, 1) +
                 pynutil.delete("\""))

        feature_to_mark = pynini.union(
            pynini.cross(" morphosyntactic_features: \"o\"", ".º"),
            pynini.cross(" morphosyntactic_features: \"a\"", ".ª"),
            pynini.cross(" morphosyntactic_features: \"er\"", ".ᵉʳ"),
        )

        verbalized = value + feature_to_mark
        self.fst = self.delete_tokens(verbalized).optimize()
Exemplo n.º 27
0
    def __init__(self, deterministic: bool = True):
        """Builds the telephone number classifier.

        A number consists of an optional country code (one to three digits,
        optionally preceded by "+"), an area part (three digits, followed by
        "-" or wrapped in parentheses), seven digits or letters with optional
        "-"/" " separators, and an optional extension of one to four digits.
        Digits are spelled out; the fields are tagged as ``country_code``,
        ``number_part`` and ``extension``.

        Args:
            deterministic: forwarded to the base class constructor.
        """
        super().__init__(name="telephone",
                         kind="classify",
                         deterministic=deterministic)

        add_separator = pynutil.insert(", ")  # between components
        # Digit-to-word mapping; "0" may additionally be read as "o".
        digit = pynini.invert(
            pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        ).optimize() | pynini.cross("0", "o")

        # ----- country code -----
        optional_plus = pynini.closure(pynutil.delete("+"), 0, 1)
        country_digits = pynini.closure(digit + insert_space, 0, 2) + digit
        country_code = (pynutil.insert("country_code: \"") + optional_plus +
                        country_digits + pynutil.insert("\""))
        optional_dash = pynini.closure(pynutil.delete("-"), 0, 1)
        optional_country_code = pynini.closure(
            country_code + optional_dash + delete_space + insert_space, 0, 1)

        # ----- area part -----
        # "800" gets a dedicated, strongly preferred reading.
        area_part_common = pynutil.add_weight(
            pynini.cross("800", "eight hundred"), -1.1)
        area_part_default = pynini.closure(digit + insert_space, 2, 2) + digit
        area_digits = area_part_default | area_part_common

        dashed_area = area_digits + pynutil.delete("-")
        closing = pynutil.delete(") ") | pynutil.delete(")-")
        parenthesized_area = pynutil.delete("(") + area_digits + closing
        area_part = (dashed_area | parenthesized_area) + add_separator

        # ----- subscriber number -----
        del_separator = pynini.closure(pynini.union("-", " "), 0, 1)
        # Exactly seven digits/letters, each optionally followed by a
        # separator; used to constrain the input side of `number_words`.
        number_length = ((NEMO_DIGIT + del_separator) |
                         (NEMO_ALPHA + del_separator))**7
        spoken_digit = (NEMO_DIGIT @ digit) + (insert_space
                                               | pynini.cross("-", ', '))
        letter_with_dash = NEMO_ALPHA + pynini.cross("-", ' ')
        number_words = pynini.closure(spoken_digit
                                      | NEMO_ALPHA
                                      | letter_with_dash)
        number_words = pynini.compose(number_length, number_words)

        number_part = (pynutil.insert("number_part: \"") + area_part +
                       number_words + pynutil.insert("\""))

        # ----- extension -----
        extension_digits = pynini.closure(digit + insert_space, 0, 3) + digit
        extension = (pynutil.insert("extension: \"") + extension_digits +
                     pynutil.insert("\""))
        optional_extension = pynini.closure(insert_space + extension, 0, 1)

        graph = optional_country_code + number_part + optional_extension
        self.fst = self.add_tokens(graph).optimize()
Exemplo n.º 28
0
def _get_ties_graph():
    """
    Transducer for 20-99 e.g
    hai ba -> 23

    The word "mươi" is deleted; it is optional when a unit follows and
    required for round tens, where a "0" is inserted. Besides the regular
    digits, the unit may be one of the special forms "mốt" (1), "tư" (4)
    or "lăm" (5).
    """
    drop_ten_word = pynini.cross("mươi", "")
    optional_ten_word = pynini.closure(delete_space + drop_ten_word, 0, 1)

    special_units = (
        pynini.cross("mốt", "1"),
        pynini.cross("tư", "4"),
        pynini.cross("lăm", "5"),
    )
    unit = graph_digit | special_units[0] | special_units[1] | special_units[2]

    tens_with_unit = ties_graph + optional_ten_word + delete_space + unit
    round_tens = ties_graph + delete_space + drop_ten_word + pynutil.insert("0")
    return pynini.union(tens_with_unit, round_tens)
Exemplo n.º 29
0
    def __init__(self, deterministic: bool = True):
        """Builds the fallback word classifier.

        A word is a non-empty run of non-space characters that are not on
        the input side of the punctuation graph. In non-deterministic mode,
        strings made of a currency sign ("$", "€", "₩", "£", "¥") followed
        by digits are additionally excluded. The result is tagged as
        ``name: "<word>"``.

        Args:
            deterministic: forwarded to the base class constructor; also
                selects whether currency amounts are excluded.
        """
        super().__init__(name="word",
                         kind="classify",
                         deterministic=deterministic)

        punct = PunctuationFst().graph
        word_char = pynini.difference(NEMO_NOT_SPACE, punct.project("input"))
        word = pynini.closure(word_char, 1)

        if not deterministic:
            currency_amount = (pynini.union("$", "€", "₩", "£", "¥") +
                               pynini.closure(NEMO_DIGIT, 1))
            word = pynini.closure(pynini.difference(word, currency_amount), 1)

        self.graph = word
        self.fst = (pynutil.insert("name: \"") + self.graph +
                    pynutil.insert("\"")).optimize()
Exemplo n.º 30
0
    def __init__(self, cardinal, deterministic: bool = True):
        """Builds the fraction classifier on top of a cardinal grammar.

        Accepts an optional integer part followed by a space, a numerator,
        "/" or " / ", and a denominator that may carry an ordinal ending
        ("rd", "th", "st", "nd", lower or upper case), which is dropped.
        The fields are tagged as ``integer_part``, ``numerator`` and
        ``denominator``.

        Args:
            cardinal: cardinal grammar exposing ``graph``.
            deterministic: forwarded to the base class constructor.
        """
        super().__init__(name="fraction", kind="classify", deterministic=deterministic)
        cardinal_graph = cardinal.graph

        integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") + pynini.accep(" ")

        # The slash closes the numerator field and separates it from the denominator.
        slash = pynini.cross("/", "\" ") | pynini.cross(" / ", "\" ")
        numerator = pynutil.insert("numerator: \"") + cardinal_graph + slash

        lower_endings = ["rd", "th", "st", "nd"]
        endings = lower_endings + [ending.upper() for ending in lower_endings]
        optional_end = pynini.closure(pynini.cross(pynini.union(*endings), ""), 0, 1)
        denominator = pynutil.insert("denominator: \"") + cardinal_graph + optional_end + pynutil.insert("\"")

        self.graph = pynini.closure(integer, 0, 1) + numerator + denominator
        self.fst = self.add_tokens(self.graph).optimize()