Exemplos de union em Python, exemplos de pynini.union em Python

Exemplo n.º 1

0

Exibir arquivo

    def __init__(self):
        """Build the "date" classifier for phrases shaped "<day> de <month>".

        The day is a number word restricted to the values 1-31 (or the word
        "primero", rewritten to "1") and is emitted as ``day: "<n>"``. The
        month is read from ``data/months.tsv`` and emitted as
        ``month: "<m>"``. The result is tagged with ``preserve_order: true``,
        wrapped by ``self.add_tokens`` and stored, optimized, in ``self.fst``.
        """
        super().__init__(name="date", kind="classify")

        # Number lexicons (presumably spoken word -> digit string; verify
        # against the TSV files).
        digit_words = pynini.string_file(
            get_abs_path("data/numbers/digit.tsv"))
        ties_words = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        teen_words = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
        twenties_words = pynini.string_file(
            get_abs_path("data/numbers/twenties.tsv"))

        # A bare tens word gets a trailing "0"; "<tens> y <digit>" drops " y ".
        round_tens = ties_words + pynutil.insert("0")
        tens_and_digit = ties_words + pynutil.delete(" y ") + digit_words
        number_words = pynini.union(
            digit_words,
            twenties_words,
            teen_words,
            round_tens,
            tens_and_digit,
        )

        # Composing with the literal strings "1".."31" rejects other values.
        day_digits = pynini.union(*(str(day) for day in range(1, 32)))
        # "primero" is accepted for the first day of the month.
        day_number = pynini.union(number_words @ day_digits,
                                  pynini.cross("primero", "1"))
        day_field = (pynutil.insert("day: \"") + day_number +
                     pynutil.insert("\""))

        month_words = pynini.string_file(get_abs_path("data/months.tsv"))
        month_field = (pynutil.insert("month: \"") + month_words +
                       pynutil.insert("\""))

        day_month = (day_field + delete_space + pynutil.delete("de") +
                     delete_extra_space + month_field)
        tagged = day_month + pynutil.insert(" preserve_order: true")
        self.fst = self.add_tokens(tagged).optimize()

Exemplo n.º 2

0

Exibir arquivo

Arquivo: graph_utils.py Projeto: quuhua911/NeMo

def shift_cardinal_gender(fst: 'pynini.FstLike') -> 'pynini.FstLike':
    """
    Composes masculine-to-feminine rewrites onto the output of a cardinal fst.

    All masculine forms of "uno" (including apocopated forms) become "una", and
    gendered hundreds (200, 300, 400...) take their feminine form, e.g.
    "doscientos" -> "doscientas". The shift applies only to the <1000 place and
    to the multiple-of-1000 place; anything above the thousands stays
    masculine, because the higher powers of ten ("millones", "billones") are
    masculine nouns and shifting them would be ungrammatical.
    e.g.
        "doscientos" -> "doscientas"
        "doscientos mil" -> "doscientas mil"
        "doscientos millones" -> "doscientos millones"
        "doscientos mil millones" -> "doscientos mil millones"
        "doscientos millones doscientos mil doscientos" -> "doscientos millones doscientas mil doscientas"

    Args:
        fst: Any fst. Composes conversion onto fst's output strings
    """
    optional_tens = pynini.closure(NEMO_SPACE + one_to_one_hundred, 0, 1)

    # Right context for a hundreds word that counts thousands:
    # " mil"/" milésimo", optional lower places, then the end of the number.
    thousand_word = pynini.accep("mil") | pynini.accep("milésimo")
    optional_hundreds = pynini.closure(NEMO_SPACE + hundreds, 0, 1)
    after_thousands = pynini.union(pynini.accep("[EOS]"), pynini.accep("\""),
                                   decimal_separator)
    before_mil = (NEMO_SPACE + thousand_word + optional_hundreds +
                  optional_tens + after_thousands)

    # Right context for a hundreds word in the final (<1000) group.
    before_double_digits = optional_tens + pynini.union(
        pynini.accep("[EOS]"), pynini.accep("\""))

    # Hundreds in the thousands group first, then hundreds in the last group.
    fem_allign = pynini.cdrewrite(fem_hundreds, "", before_mil, NEMO_SIGMA)
    fem_allign = fem_allign @ pynini.cdrewrite(
        fem_hundreds, "", before_double_digits, NEMO_SIGMA)

    # A "one" directly before a quote, EOS or the decimal separator ends the
    # number, so it is safe to feminize.
    number_end = pynini.union("[EOS]", "\"", decimal_separator)
    fem_allign = fem_allign @ pynini.cdrewrite(fem_ones, "", number_end,
                                               NEMO_SIGMA)

    return fst @ fem_allign

Exemplo n.º 3

0

Exibir arquivo

Arquivo: edit_transducer.py Projeto: pombredanne/pynini-learn

    def __init__(self,
                 alphabet,
                 insert_cost=DEFAULT_INSERT_COST,
                 delete_cost=DEFAULT_DELETE_COST,
                 substitute_cost=DEFAULT_SUBSTITUTE_COST):
        """Constructor.

        Builds the left and right factors of the edit transducer and stores
        their closures in ``self._e_i`` and ``self._e_o``.

        Args:
          alphabet: edit alphabet (an iterable of strings).
          insert_cost: the cost for the insertion operation.
          delete_cost: the cost for the deletion operation.
          substitute_cost: the cost for the substitution operation.
        """
        insert_tag = "[{}]".format(self.INSERT)
        delete_tag = "[{}]".format(self.DELETE)
        substitute_tag = "[{}]".format(self.SUBSTITUTE)

        # Left factor. Each edit carries half of its cost here because the
        # other half is paid again when the right factor is traversed.
        symbols = union(*alphabet).optimize(True)
        left_insert = transducer(
            "", insert_tag, weight=insert_cost / 2).optimize(True)
        left_delete = transducer(
            symbols, delete_tag, weight=delete_cost / 2).optimize(True)
        left_substitute = transducer(
            symbols, substitute_tag, weight=substitute_cost / 2).optimize(True)
        left_ops = union(symbols, left_insert, left_delete,
                         left_substitute).optimize(True)

        # Right factor: the inverse of the left factor (input and output labels
        # swapped), with the insert and delete labels then exchanged on what
        # has become the input side.
        right_ops = invert(left_ops)
        table = right_ops.input_symbols()
        insert_id = table.find(self.INSERT)
        delete_id = table.find(self.DELETE)
        swap = ((insert_id, delete_id), (delete_id, insert_id))
        right_ops.relabel_pairs(ipairs=swap)

        # Any number of operations is allowed on either side.
        self._e_i = left_ops.closure().optimize(True)
        self._e_o = right_ops.closure().optimize(True)

Exemplo n.º 4

0

Exibir arquivo

Arquivo: phon_fst.py Projeto: wrznr/timur

    def __construct_r21(self):
        """Build the "low to up" rule.

        The rule is the closure of two alternatives: any run of plain alphabet
        symbols (characters, "<NoHy>", "<NoDef>"), or one or more "<^UC>"
        markers (deleted) followed by either "<NoHy>"/"<NoDef>" or one
        application of ``self.__syms.to_upper`` (presumably a lower-to-upper
        case mapping; confirm in the symbols class).

        Returns:
          The label-pushed, optimized rule transducer.
        """
        with pynini.default_token_type(self.__syms.alphabet):

            passthrough_markers = pynini.string_map(
                ["<NoHy>", "<NoDef>"]).project("input")
            alphabet = pynini.union(self.__syms.characters,
                                    passthrough_markers)

            # NOTE(review): writes "to_upper.dot" to the working directory on
            # every call; looks like leftover debugging output. Confirm before
            # removing.
            self.__syms.to_upper.draw("to_upper.dot")

            # SFST builds this with negation (which is expensive); a plain
            # union/closure appears to be sufficient here.
            unchanged = alphabet.closure()
            drop_uc = pynini.cross("<^UC>", "").closure(1)
            after_uc = pynini.union(
                pynini.string_map(["<NoHy>", "<NoDef>"]).project("input"),
                self.__syms.to_upper)
            rule = pynini.union(unchanged,
                                pynini.concat(drop_uc, after_uc)).closure()
            return pynini.push(rule, push_labels=True).optimize()

Exemplo n.º 5

0

Exibir arquivo

Arquivo: phon_fst.py Projeto: wrznr/timur

    def __construct_r20(self):
        """Build the "up to low" rule.

        The rule is the closure of two alternatives: any run of plain alphabet
        symbols (characters, "<^UC>", "<NoHy>", "<NoDef>"), or one or more
        "<CB>" markers (deleted) followed by either one of those three markers
        or one application of ``self.__syms.to_lower`` (presumably an
        upper-to-lower case mapping; confirm in the symbols class).

        Returns:
          The label-pushed, optimized rule transducer.
        """
        with pynini.default_token_type(self.__syms.alphabet):

            passthrough_markers = pynini.string_map(
                ["<^UC>", "<NoHy>", "<NoDef>"]).project("input")
            alphabet = pynini.union(self.__syms.characters,
                                    passthrough_markers)

            # SFST uses a rewrite rule here; this is built from union and
            # closure instead.
            unchanged = alphabet.closure()
            drop_cb = pynini.cross("<CB>", "").closure(1)
            after_cb = pynini.union(
                pynini.string_map(["<^UC>", "<NoHy>",
                                   "<NoDef>"]).project("input"),
                self.__syms.to_lower)
            rule = pynini.union(unchanged,
                                pynini.concat(drop_cb, after_cb)).closure()
            return pynini.push(rule, push_labels=True).optimize()

Exemplo n.º 6

0

Exibir arquivo

    def __init__(self, cardinal: GraphFst):
        """Build the "ordinal" classifier on top of a cardinal grammar.

        An ordinal word is first rewritten into the matching cardinal word
        (table lookups for digits and teens, "tieth" -> "ty", or dropping a
        trailing "th") and then composed with ``cardinal.graph_no_exception``.
        The composed graph is kept in ``self.graph``; the tagged, tokenized
        and optimized classifier is stored in ``self.fst``.

        Args:
          cardinal: cardinal grammar providing ``graph_no_exception``.
        """
        super().__init__(name="ordinal", kind="classify")

        cardinal_words = cardinal.graph_no_exception
        ordinal_digits = pynini.string_file(get_abs_path("data/ordinals/digit.tsv"))
        ordinal_teens = pynini.string_file(get_abs_path("data/ordinals/teen.tsv"))

        # Only the tail of the word is rewritten; any prefix passes through.
        ordinal_tail = pynini.union(
            ordinal_digits,
            ordinal_teens,
            pynini.cross("tieth", "ty"),
            pynini.cross("th", ""),
        )
        to_cardinal_word = pynini.closure(NEMO_CHAR) + ordinal_tail

        self.graph = to_cardinal_word @ cardinal_words
        tagged = pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")
        self.fst = self.add_tokens(tagged).optimize()

Exemplo n.º 7

0

Exibir arquivo

Arquivo: paradigms.py Projeto: yzhang123/pynini

def build_stem_ids(min_id: int, max_id: int) -> pynini.Fst:
    """Builds the set of stem IDs.

    A stem ID is a string of the form __n__; one is produced for every n in
    the half-open range [min_id, max_id).

    Args:
      min_id: minimum stem ID (inclusive).
      max_id: maximum stem ID (exclusive).

    Returns:
      FST representing the stem IDs in [min_id, max_id) as strings.
    """
    stem_ids = [f"__{stem_id}__" for stem_id in range(min_id, max_id)]
    return pynini.union(*stem_ids).optimize()

Exemplo n.º 8

0

Exibir arquivo

    def __init__(self, syms, sublexica, deko_filter, inflection, phon):
        """Build the default stems from the morphology components.

        A temporary morphology transducer is assembled from the sublexica,
        the filters, the inflection component and the phonology component,
        and is then handed to three private constructors that derive the
        default stems stored on the instance.

        NOTE(review): `*` between FSTs is presumably the composition operator
        of the pynini version this code targets, `+` concatenation and `|`
        union; confirm against that version's operator overloads.

        Args:
          syms: symbols object; `alphabet`, `characters` and `stem_types`
            are read from it.
          sublexica: provides `verbal_pref_stems`, `base_stems` and
            `nodef_to_null`; also passed to the participle constructor.
          deko_filter: provides `pref_filter`, `compound_filter`,
            `infix_filter` and `uplow`.
          inflection: provides `inflection` and `inflection_filter`.
          phon: provides `phon`, applied last via composition.
        """

        #
        # store alphabet
        self.__syms = syms

        #
        # run parts of morphology building (cf. timur_fst)
        # prefix stems followed by base stems, run through the prefix filter
        tmp = (sublexica.verbal_pref_stems + sublexica.base_stems
               ) * sublexica.nodef_to_null * deko_filter.pref_filter
        # plain base stems or the prefixed ones, run through the compound filter
        tmp = (sublexica.base_stems | tmp) * deko_filter.compound_filter

        # ANY TODO: Move to symbols!
        # any sequence of characters, stem types and the listed marker symbols
        alphabet = pynini.union(
            syms.characters, syms.stem_types,
            pynini.string_map([
                "<FB>", "<SS>", "<n>", "<~n>", "<e>", "<d>", "<Ge-Nom>",
                "<UL>", "<NoHy>", "<NoDef>", "<ge>", "<Ge>", "<no-ge>", "<CB>"
            ],
                              input_token_type=syms.alphabet,
                              output_token_type=syms.alphabet).project()
        ).closure().optimize()

        # append the inflection, then apply the inflection, infix and
        # upper/lower-case filters
        tmp = (tmp + inflection.inflection) * (
            alphabet + inflection.inflection_filter
        ) * deko_filter.infix_filter * deko_filter.uplow

        # insert a "<WB>" symbol on the output side at both ends, then apply
        # the phonology component
        tmp = pynini.compose(
            pynini.concat(
                pynini.transducer("",
                                  "<WB>",
                                  output_token_type=self.__syms.alphabet),
                tmp,
                pynini.transducer("",
                                  "<WB>",
                                  output_token_type=self.__syms.alphabet),
            ), phon.phon).optimize()

        #
        # default stems

        # create a default composition stem for nouns
        self.__compound_stems_nn = self.__construct_compound_stems_nn(tmp)

        # create a deriv stem for Ge nominalization (Gelerne)
        self.__ge_nom_stems_v = self.__construct_ge_nom_stems_v(tmp)

        # create an adjective base stem from participles
        self.__participle_adj = self.__construct_participle_adj(tmp, sublexica)
        # NOTE(review): writes "participle_adj.dot" on every construction;
        # looks like debugging output - confirm it is still wanted.
        self.__participle_adj.draw("participle_adj.dot", portrait=True)

Exemplo n.º 9

0

Exibir arquivo

    def __construct_r19(self):
        """Build the rule that eliminates markers.

        Characters and the markers "<CB>", "<^UC>", "<NoHy>" and "<NoDef>"
        pass through unchanged; the markers "<DEL-S>", "<SS>", "<FB>",
        "<^Gen>", "<^Del>", "<^pl>", "<^Ax>" and "<WB>" are mapped to the
        empty string.

        Returns:
          The optimized closure over both alternatives.
        """
        token_types = dict(input_token_type=self.__syms.alphabet,
                           output_token_type=self.__syms.alphabet)

        kept_markers = pynini.string_map(
            ["<CB>", "<^UC>", "<NoHy>", "<NoDef>"], **token_types).project()
        alphabet = pynini.union(self.__syms.characters, kept_markers)

        dropped_markers = pynini.string_map(
            [
                "<DEL-S>", "<SS>", "<FB>", "<^Gen>", "<^Del>", "<^pl>",
                "<^Ax>", "<WB>"
            ],
            **token_types).project()
        erase_marker = pynini.transducer(dropped_markers, "")

        return pynini.union(alphabet, erase_marker).closure().optimize()

Exemplo n.º 10

0

Exibir arquivo

  def __construct_imperative_filter(self):
    """Build the imperative filter (imperatives have no separable prefixes).

    Two shapes are accepted: any sequence of alphabet symbols in which stem
    type symbols are rewritten to "<CB>", or "<Base_Stems>" (rewritten to
    "<CB>") followed by alphabet symbols, a deleted "<^imp>" marker and more
    alphabet symbols.

    Returns:
      The optimized union of both shapes.
    """
    with pynini.default_token_type(self.__syms.alphabet):

      marker_symbols = pynini.string_map(["<n>", "<~n>", "<e>", "<d>", "<NoHy>", "<NoDef>", "<VADJ>", "<CB>", "<FB>", "<UL>", "<SS>", "<DEL-S>", "<Low#>", "<Up#>", "<Fix#>", "<^UC>", "<^Ax>", "<^pl>", "<^Gen>", "<^Del>"]).project("input")
      alphabet = pynini.union(self.__syms.characters, marker_symbols).optimize()

      stem_type_to_boundary = pynini.cross(self.__syms.stem_types, "<CB>")
      non_imperative = pynini.union(
          alphabet, stem_type_to_boundary).closure().optimize()

      # alphabet.closure() is deliberately called once per position, in the
      # same order as before, because the method form updates `alphabet`.
      imperative = pynini.cross("<Base_Stems>", "<CB>") + alphabet.closure()
      imperative = imperative + pynini.cross("<^imp>", "")
      imperative = imperative + alphabet.closure()

      return pynini.union(non_imperative, imperative).optimize()

Exemplo n.º 11

0

Exibir arquivo

Arquivo: electronic.py Projeto: manneh/NeMo

    def __init__(self):
        """Build the "electronic" classifier for spoken e-mail addresses and URLs.

        Two alternatives are accepted:
          * `<username> at <server> dot <domain>`, emitted as
            `username: "..." domain: "..."`;
          * an optional spelled-out "http"/"https" prefix, "www", "dot" and
            further parts, emitted as `protocol: "..."`.
        The union is wrapped by `self.add_tokens` and stored, optimized, in
        `self.fst`.
        """
        super().__init__(name="electronic", kind="classify")

        # Local definition: deletes exactly one space character.
        delete_extra_space = pynutil.delete(" ")
        # A letter, or an entry of the digit/zero tables (presumably spoken
        # number word -> digit; verify against the TSV files).
        alpha_num = (
            NEMO_ALPHA
            | pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
            | pynini.string_file(get_abs_path("data/numbers/zero.tsv")))

        # The symbols table is inverted before use, i.e. applied in the
        # opposite direction to how it is written in the file.
        symbols = pynini.string_file(
            get_abs_path("data/electronic/symbols.tsv")).invert()

        accepted_username = alpha_num | symbols
        process_dot = pynini.cross("dot", ".")
        # The username starts with an alphanumeric, followed by any number of
        # space-separated alphanumerics or symbols; the spaces are removed.
        username = (pynutil.insert("username: \"") + alpha_num +
                    pynini.closure(delete_extra_space + accepted_username) +
                    pynutil.insert("\""))
        # One or more alphanumerics, with the separating spaces removed.
        single_alphanum = pynini.closure(alpha_num +
                                         delete_extra_space) + alpha_num
        server = single_alphanum | pynini.string_file(
            get_abs_path("data/electronic/server_name.tsv"))
        domain = single_alphanum | pynini.string_file(
            get_abs_path("data/electronic/domain.tsv"))
        domain_graph = (pynutil.insert("domain: \"") + server +
                        delete_extra_space + process_dot + delete_extra_space +
                        domain + pynutil.insert("\""))
        # The word "at" is deleted; one space is inserted between the
        # username field and the domain field.
        graph = username + delete_extra_space + pynutil.delete(
            "at") + insert_space + delete_extra_space + domain_graph

        ############# url ###
        # "w w w" and "www" both produce "www".
        protocol_end = pynini.cross(pynini.union("w w w", "www"), "www")
        # Spelled-out scheme followed by " colon slash slash " -> "://".
        protocol_start = (pynini.cross("h t t p", "http") | pynini.cross(
            "h t t p s", "https")) + pynini.cross(" colon slash slash ", "://")
        # .com,
        # A symbol followed by either a domain or a run of username tokens.
        # `+` binds tighter than `|`, so the closure and the trailing
        # accepted_username together form the second alternative.
        ending = (delete_extra_space + symbols + delete_extra_space +
                  (domain
                   | pynini.closure(accepted_username + delete_extra_space, ) +
                   accepted_username))

        # Optional scheme, "www", "dot", at least one name token and at least
        # one ending.
        protocol = (pynini.closure(protocol_start, 0, 1) + protocol_end +
                    delete_extra_space + process_dot +
                    pynini.closure(delete_extra_space + accepted_username, 1) +
                    pynini.closure(ending, 1))
        protocol = pynutil.insert("protocol: \"") + protocol + pynutil.insert(
            "\"")
        graph |= protocol
        ########

        final_graph = self.add_tokens(graph)
        self.fst = final_graph.optimize()

Exemplo n.º 12

0

Exibir arquivo

    def __construct_imperative_filter(self):
        """Build the imperative filter (imperatives have no separable prefixes).

        Two shapes are accepted: any sequence of alphabet symbols in which
        stem type symbols are rewritten to "<CB>", or "<Base_Stems>"
        (rewritten to "<CB>") followed by alphabet symbols, a deleted
        "<^imp>" marker and more alphabet symbols.

        Returns:
          The optimized union of both shapes.
        """
        syms = self.__syms
        both_sides = dict(input_token_type=syms.alphabet,
                          output_token_type=syms.alphabet)

        marker_symbols = pynini.string_map(
            [
                "<n>", "<~n>", "<e>", "<d>", "<NoHy>", "<NoDef>", "<VADJ>",
                "<CB>", "<FB>", "<UL>", "<SS>", "<DEL-S>", "<Low#>",
                "<Up#>", "<Fix#>", "<^UC>", "<^Ax>", "<^pl>", "<^Gen>",
                "<^Del>"
            ],
            **both_sides).project()
        alphabet = pynini.union(syms.characters, marker_symbols).optimize()

        stem_type_to_boundary = pynini.transducer(syms.stem_types, "<CB>",
                                                  **both_sides)
        non_imperative = pynini.union(
            alphabet, stem_type_to_boundary).closure().optimize()

        # The four parts are built in the same order as before; `alphabet`
        # has its closure() method called once for each position it fills.
        base_to_boundary = pynini.transducer("<Base_Stems>", "<CB>",
                                             **both_sides)
        before_marker = alphabet.closure()
        drop_imperative = pynini.transducer("<^imp>",
                                            "",
                                            input_token_type=syms.alphabet)
        after_marker = alphabet.closure()
        imperative = pynini.concat(base_to_boundary, before_marker,
                                   drop_imperative, after_marker)

        return pynini.union(non_imperative, imperative).optimize()

Exemplo n.º 13

0

Exibir arquivo

def _get_year_graph(deterministic: bool = True):
    """
    Transducer for four-digit years starting with "1" or "2" (1000 - 2999),
    optionally followed by "s" or " s" (the space is dropped), e.g.
    1290 -> twelve ninety
    The verbalization itself is delegated to get_hundreds_graph; 2000 - 2009
    are expected to be verbalized as two thousand.

    Args:
        deterministic: forwarded unchanged to get_hundreds_graph
    """
    verbalizer = get_hundreds_graph(deterministic)
    optional_plural = pynini.closure(pynini.cross(" s", "s") | "s", 0, 1)
    year_digits = pynini.union("1", "2") + NEMO_DIGIT + NEMO_DIGIT + NEMO_DIGIT + optional_plural
    return year_digits @ verbalizer

Exemplo n.º 14

0

Exibir arquivo

    def __init__(self, cardinal: GraphFst, deterministic=False):
        """Build the "ordinal" classifier for digit strings with an ordinal marker.

        The input is a run of digits and "." characters followed by either
        one of the endings "ter", "tes", "tem", "te", "ten" (lightly
        weighted) or a ".". The marker is deleted and the remaining number is
        composed with ``cardinal.graph``.

        Args:
          cardinal: cardinal grammar providing ``graph``.
          deterministic: forwarded to the base class constructor.
        """
        super().__init__(name="ordinal",
                         kind="classify",
                         deterministic=deterministic)

        cardinal_graph = cardinal.graph
        endings = ("ter", "tes", "tem", "te", "ten")

        number = pynini.closure(NEMO_DIGIT | pynini.accep("."))
        weighted_endings = pynutil.add_weight(pynini.union(*endings),
                                              weight=0.0001)
        ordinal_marker = pynutil.delete(weighted_endings | pynini.accep("."))

        self.graph = ((number + ordinal_marker) @ cardinal_graph).optimize()

        tagged = (pynutil.insert("integer: \"") + self.graph +
                  pynutil.insert("\""))
        self.fst = self.add_tokens(tagged).optimize()

Exemplo n.º 15

0

Exibir arquivo

 def setUpClass(cls):
     """Builds the shared verb paradigm used by the tests in this class.

     The paradigm has four slots keyed on an "aspect" feature: the bare stem
     (root), stem + "+al" (dubitative), the CVCC template + "+inay"
     (gerundial) and the CVCVVC template + "+ʔaa" (durative). The result is
     stored in ``cls.paradigm``.
     """
     super().setUpClass()
     # Not clear "aspect" is exactly the right concept.
     aspect = features.Feature("aspect", "root", "dubitative", "gerundial",
                               "durative")
     verb = features.Category(aspect)
     root = features.FeatureVector(verb, "aspect=root")
     stem = paradigms.make_byte_star_except_boundary()

     vowels = ("a", "i", "o", "u")
     any_vowel = pynini.union(*vowels)
     consonant = pynini.union("c", "m", "h", "l", "y", "k", "ʔ", "d", "n",
                              "w", "t")

     # First template: apply Procrustean transformation to CVCC^?.
     cvcc = (consonant + any_vowel + pynutil.delete(any_vowel).ques +
             consonant + pynutil.delete(any_vowel).star +
             consonant.ques).optimize()

     # Second template: apply Procrustean transformation to CVCVVC^?. The
     # vowel has to be copied, which is easiest to express by building one
     # branch per vowel and taking the union of the branches.
     cvcvvc = pynini.Fst()
     for vowel in vowels:
         branch = (consonant + vowel + pynutil.delete(vowel).ques +
                   consonant + pynutil.delete(vowel).star +
                   pynutil.insert(vowel + vowel) + consonant.ques)
         cvcvvc.union(branch)
     cvcvvc.optimize()

     dubitative = features.FeatureVector(verb, "aspect=dubitative")
     gerundial = features.FeatureVector(verb, "aspect=gerundial")
     durative = features.FeatureVector(verb, "aspect=durative")
     slots = [
         (stem, root),
         (paradigms.suffix("+al", stem), dubitative),
         (paradigms.suffix("+inay", stem @ cvcc), gerundial),
         (paradigms.suffix("+ʔaa", stem @ cvcvvc), durative),
     ]
     cls.paradigm = paradigms.Paradigm(
         category=verb,
         slots=slots,
         lemma_feature_vector=root,
         stems=["caw", "cuum", "hoyoo", "diiyl", "ʔilk", "hiwiit"])

Exemplo n.º 16

0

Exibir arquivo

    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        """Build the "ordinal" classifier for digit strings with an English suffix.

        The input is a run of digits and "," characters followed by one of
        "rd", "th", "st", "nd" (lower or upper case). The suffix is deleted
        and the remaining number is composed with ``cardinal.graph``.

        Args:
          cardinal: cardinal grammar providing ``graph``.
          deterministic: forwarded to the base class constructor.
        """
        super().__init__(name="ordinal",
                         kind="classify",
                         deterministic=deterministic)

        cardinal_graph = cardinal.graph
        lower_suffixes = ("rd", "th", "st", "nd")
        suffixes = [*lower_suffixes,
                    *(suffix.upper() for suffix in lower_suffixes)]

        number = pynini.closure(NEMO_DIGIT | pynini.accep(","))
        drop_suffix = pynutil.delete(pynini.union(*suffixes))
        self.graph = ((number + drop_suffix) @ cardinal_graph).optimize()

        tagged = (pynutil.insert("integer: \"") + self.graph +
                  pynutil.insert("\""))
        self.fst = self.add_tokens(tagged).optimize()

Exemplo n.º 17

0

Exibir arquivo

    def __init__(self,
                 alphabet: Iterable[str],
                 insert_cost: float = DEFAULT_INSERT_COST,
                 delete_cost: float = DEFAULT_DELETE_COST,
                 substitute_cost: float = DEFAULT_SUBSTITUTE_COST):
        """Constructor.

        Builds the left and right factors of the edit transducer and stores
        their closures in ``self._e_i`` and ``self._e_o``.

        Args:
          alphabet: edit alphabet (an iterable of strings).
          insert_cost: the cost for the insertion operation.
          delete_cost: the cost for the deletion operation.
          substitute_cost: the cost for the substitution operation.
        """
        # Left factor. Each edit carries half of its cost here because the
        # other half is paid again when the right factor is traversed.
        symbols = pynini.union(*alphabet).optimize()
        left_insert = pynutil.insert(f"[{self.INSERT}]",
                                     weight=insert_cost / 2)
        delete_mark = pynini.accep(f"[{self.DELETE}]", weight=delete_cost / 2)
        left_delete = pynini.cross(symbols, delete_mark)
        substitute_mark = pynini.accep(f"[{self.SUBSTITUTE}]",
                                       weight=substitute_cost / 2)
        left_substitute = pynini.cross(symbols, substitute_mark)
        left_ops = pynini.union(symbols, left_insert, left_delete,
                                left_substitute).optimize()

        # Right factor: the inverse of the left factor (input and output labels
        # swapped), with the insert and delete labels then exchanged on what
        # has become the input side.
        right_ops = pynini.invert(left_ops)
        table = pynini.generated_symbols()
        insert_id = table.find(self.INSERT)
        delete_id = table.find(self.DELETE)
        right_ops.relabel_pairs(
            ipairs=[(insert_id, delete_id), (delete_id, insert_id)])

        # Any number of operations is allowed on either side.
        self._e_i = left_ops.closure().optimize()
        self._e_o = right_ops.closure().optimize()

Exemplo n.º 18

0

Exibir arquivo

    def token_lattice(self, token: str) -> pynini.Fst:
        """Constructs a "link" of the lattice for a given token.

        The link is the union of the token's expansions from the
        deduplicator, the deabbreviator, the regexps and the lexicon, in that
        order.

        Args:
          token: An input token.

        Returns:
          An FST "link".
        """
        expanders = (
            self._deduplicator,
            self._deabbreviator,
            self._regexps,
            self._lexicon,
        )
        expansions = [expander.expand(token) for expander in expanders]
        return pynini.union(*expansions).optimize()

Exemplo n.º 19

0

Exibir arquivo

 def _get_thousands_graph():
     """
     Transducer for "<digit> nghìn/ngàn [<digit> trăm] <rest>".

     The thousands word ("nghìn" or "ngàn") and the hundreds word ("trăm") are
     deleted. When no hundreds part is spoken, "0" is inserted in its place.
     The rest is taken from graph_teen, the ties graph or graph_digits.
     """
     ties = _get_ties_graph()

     spoken_hundreds = (graph_digit | graph_zero) + delete_space + pynutil.delete("trăm")
     hundreds_place = spoken_hundreds | pynutil.insert("0")

     thousand_word = pynutil.delete(pynini.union("nghìn", "ngàn"))
     remainder = graph_teen | ties | graph_digits

     return (
         graph_digit
         + delete_space
         + thousand_word
         + delete_space
         + hundreds_place
         + delete_space
         + remainder
     )

Exemplo n.º 20

0

Exibir arquivo

Arquivo: graph_utils.py Projeto: quuhua911/NeMo

def strip_cardinal_apocope(fst: 'pynini.FstLike') -> 'pynini.FstLike':
    """
    Undoes apocope at the end of a cardinal string, e.g. "un" -> "uno". Given how cardinals are formed, this in effect only
    touches strings whose final value is a variation of "un".
    e.g.
        "un" -> "uno"
        "veintiún" -> "veintiuno"

    Args:
        fst: Any fst. Composes conversion onto fst's output strings
    """
    # Apocope is kept in front of large values (e.g. "millón"), so the rewrite is limited to a form directly before the
    # end of the string or a closing quote.
    restore_full_form = pynini.union(pynini.cross("un", "uno"), pynini.cross("ún", "uno"))
    end_of_number = pynini.union("[EOS]", "\"")
    rewrite = pynini.cdrewrite(restore_full_form, "", end_of_number, NEMO_SIGMA)
    return fst @ rewrite

Exemplo n.º 21

0

Exibir arquivo

Arquivo: graph_utils.py Projeto: quuhua911/NeMo

def add_cardinal_apocope_fem(fst: 'pynini.FstLike') -> 'pynini.FstLike':
    """
    Applies apocope at the end of a feminine cardinal string, e.g. "una" -> "un". In formal speech this happens only when
    "una" precedes a stressed "a" sound, which cannot be predicted from the text string, so it is meant for
    non-deterministic cases.
    e.g.
        "una" -> "un"
        "veintiuna" -> "veintiún"

    Args:
        fst: Any fst. Composes conversion onto fst's output strings
    """
    # The stress trigger follows the cardinal and only affects the sound right before it, so the rewrite is limited to a
    # form directly before the end of the string or a closing quote.
    shorten = pynini.union(pynini.cross("una", "un"), pynini.cross("veintiuna", "veintiún"))
    end_of_number = pynini.union("[EOS]", "\"")
    rewrite = pynini.cdrewrite(shorten, "", end_of_number, NEMO_SIGMA)
    return fst @ rewrite

Exemplo n.º 22

0

Exibir arquivo

 def __suff_stems_filter(self, features):
     """Return a union over filters for each feature given.

     Each filter has the shape: feature (deleted), "<Suff_Stems>", the same
     feature again (deleted).
     """
     alphabet = self.__syms.alphabet

     # Start from an empty FST that already carries the symbol tables.
     combined = pynini.Fst()
     combined.set_input_symbols(alphabet)
     combined.set_output_symbols(alphabet)

     marker = pynini.acceptor("<Suff_Stems>", token_type=alphabet)
     for feature in features:
         drop_feature = pynini.transducer(feature,
                                          "",
                                          input_token_type=alphabet)
         one_filter = pynini.concat(drop_feature, marker, drop_feature)
         combined = pynini.union(combined, one_filter)
     return combined.optimize()

Exemplo n.º 23

0

Exibir arquivo

def _get_range_graph():
    """
    Transducer for decades (1**0s, 2**0s), centuries (2*00s, 1*00s), millennia (2000s)

    Three spoken shapes are accepted: "<ties|teen> hundreds", "two thousands"
    and "<ties|teen> <plural decade word>". The result is then restricted to
    four digits starting with "1" or "2" followed by "s".
    """
    ties = _get_ties_graph()
    leading_pair = ties | graph_teen

    centuries = leading_pair + delete_space + pynini.cross("hundreds", "00s")
    millennia = pynini.cross("two", "2") + delete_space + pynini.cross("thousands", "000s")

    # Singularize the decade word ("...ies" -> "...y", or drop the final "s"),
    # then read it as a ties value or as "ten".
    singularize = pynini.closure(NEMO_ALPHA, 1) + (pynini.cross("ies", "y") | pynutil.delete("s"))
    decade_value = singularize @ (ties | pynini.cross("ten", "10"))
    decades = leading_pair + delete_space + decade_value + pynutil.insert("s")

    graph = centuries | millennia
    graph = graph | decades

    four_digits_s = pynini.union("1", "2") + NEMO_DIGIT + NEMO_DIGIT + NEMO_DIGIT + "s"
    return graph @ four_digits_s

Exemplo n.º 24

0

Exibir arquivo

 def __suff_stems_filter(self, features):
   """Return a union over filters for each feature given.

   Each filter has the shape: feature (deleted), "<Suff_Stems>", the same
   feature again (deleted).
   """
   alphabet = self.__syms.alphabet
   with pynini.default_token_type(alphabet):
     # Start from an empty FST that already carries the symbol tables.
     combined = pynini.Fst()
     combined.set_input_symbols(alphabet)
     combined.set_output_symbols(alphabet)

     marker = pynini.accep("<Suff_Stems>")
     for feature in features:
       drop_feature = pynini.cross(feature, "")
       one_filter = drop_feature + marker + drop_feature
       combined = pynini.union(combined, one_filter)
     return combined.optimize()

Exemplo n.º 25

0

Exibir arquivo

Arquivo: features.py Projeto: yzhang123/pynini

  def _make_feature_mapper(self) -> pynini.Fst:
    r"""Convenience function generating a map to human-readable strings.

    One mapping is built for every value of every feature in
    ``self._features``.

    Returns:
      A transducer that maps from internal symbols like "[case=nom]" to a
      sequence that will be readable as a string ("\[case=nom\]") for all
      feature-value combinations.
    """
    mappings = []
    for feature in self._features:
      feature_name = feature.name
      tags = (f"[{feature_name}={value}]" for value in feature.values)
      mappings.extend(pynini.cross(tag, pynini.escape(tag)) for tag in tags)
    return pynini.union(*mappings).closure().optimize()

Exemplo n.º 26

0

Exibir arquivo

    def __init__(self):
        """Build the "ordinal" verbalizer.

        The ``integer: "<value>"`` wrapper is removed so that only the value
        remains, and the ``morphosyntactic_features`` field that follows is
        replaced by an ordinal mark: "o" -> ".º", "a" -> ".ª", "er" -> ".ᵉʳ".
        The result goes through ``self.delete_tokens`` and is stored,
        optimized, in ``self.fst``.
        """
        super().__init__(name="ordinal", kind="verbalize")

        open_field = (pynutil.delete("integer:") + delete_space +
                      pynutil.delete("\""))
        value = pynini.closure(NEMO_NOT_QUOTE, 1)
        close_field = pynutil.delete("\"")

        masculine = pynini.cross(" morphosyntactic_features: \"o\"", ".º")
        feminine = pynini.cross(" morphosyntactic_features: \"a\"", ".ª")
        apocopated = pynini.cross(" morphosyntactic_features: \"er\"", ".ᵉʳ")
        ordinal_mark = pynini.union(masculine, feminine, apocopated)

        verbalized = (open_field + value + close_field) + ordinal_mark
        self.fst = self.delete_tokens(verbalized).optimize()

Exemplo n.º 27

0

Exibir arquivo

Arquivo: telephone.py Projeto: serkhanekarim/marketing-analysis

    def __init__(self, deterministic: bool = True):
        """Build the "telephone" classifier.

        The accepted input is an optional country code, an area part, a
        number part and an optional extension. The output carries the fields
        `country_code: "..."`, `number_part: "..."` and `extension: "..."`,
        is wrapped by `self.add_tokens` and stored, optimized, in `self.fst`.

        Args:
          deterministic: forwarded to the base class constructor.
        """
        super().__init__(name="telephone",
                         kind="classify",
                         deterministic=deterministic)

        add_separator = pynutil.insert(", ")  # between components
        # Inverted digit table (presumably digit -> spoken word; verify
        # against the TSV file), plus "0" rendered as "o".
        digit = pynini.invert(
            pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        ).optimize() | pynini.cross("0", "o")

        # Optional "+" (deleted) followed by one to three digits, with a
        # space inserted between digits.
        country_code = (pynutil.insert("country_code: \"") +
                        pynini.closure(pynutil.delete("+"), 0, 1) +
                        pynini.closure(digit + insert_space, 0, 2) + digit +
                        pynutil.insert("\""))
        # The country code may be followed by a "-" (deleted).
        optional_country_code = pynini.closure(
            country_code + pynini.closure(pynutil.delete("-"), 0, 1) +
            delete_space + insert_space, 0, 1)

        # "800" has a dedicated reading with a negative weight added.
        # NOTE(review): presumably so that it is preferred over the
        # digit-by-digit reading - confirm the weight semantics.
        area_part_common = pynutil.add_weight(
            pynini.cross("800", "eight hundred"), -1.1)
        # Default reading: exactly three digits.
        area_part_default = pynini.closure(digit + insert_space, 2, 2) + digit
        area_part = area_part_default | area_part_common

        # Either "NNN-" or "(NNN) " / "(NNN)-"; the punctuation is deleted and
        # the ", " separator is appended in both cases.
        area_part = (
            (area_part + pynutil.delete("-"))
            | (pynutil.delete("(") + area_part +
               (pynutil.delete(") ") | pynutil.delete(")-")))) + add_separator

        del_separator = pynini.closure(pynini.union("-", " "), 0, 1)
        # Length constraint: digits or letters, each optionally followed by
        # "-" or " ". NOTE(review): `**7` presumably means exactly seven
        # repetitions - confirm against pynini's operator overloads.
        number_length = ((NEMO_DIGIT + del_separator) |
                         (NEMO_ALPHA + del_separator))**7
        # Rendering: a digit is converted through `digit` and followed by an
        # inserted space (or "-" becomes ", "); a letter is kept as is, and a
        # "-" after a letter becomes a space.
        number_words = pynini.closure((NEMO_DIGIT @ digit) +
                                      (insert_space | pynini.cross("-", ', '))
                                      | NEMO_ALPHA
                                      | (NEMO_ALPHA + pynini.cross("-", ' ')))
        # Only renderings whose input satisfies the length constraint remain.
        number_words = pynini.compose(number_length, number_words)
        number_part = area_part + number_words
        number_part = pynutil.insert(
            "number_part: \"") + number_part + pynutil.insert("\"")
        # One to four digits, with a space inserted between digits.
        extension = (pynutil.insert("extension: \"") +
                     pynini.closure(digit + insert_space, 0, 3) + digit +
                     pynutil.insert("\""))
        optional_extension = pynini.closure(insert_space + extension, 0, 1)

        graph = optional_country_code + number_part + optional_extension
        final_graph = self.add_tokens(graph)
        self.fst = final_graph.optimize()

Exemplo n.º 28

0

Exibir arquivo

def _get_ties_graph():
    """
    Transducer for 20-99 e.g
    hai ba -> 23

    The tens word "mươi" is deleted. It is optional in front of a unit word
    and required for a round ten, where "0" is inserted as the unit digit.
    The unit position also accepts "mốt" (1), "tư" (4) and "lăm" (5).
    """
    unit_one = pynini.cross("mốt", "1")
    unit_four = pynini.cross("tư", "4")
    unit_five = pynini.cross("lăm", "5")
    tens_word = pynini.cross("mươi", "")
    optional_tens_word = pynini.closure(delete_space + tens_word, 0, 1)

    unit = graph_digit | unit_one | unit_four | unit_five
    tens_with_unit = ties_graph + optional_tens_word + delete_space + unit
    round_tens = ties_graph + delete_space + tens_word + pynutil.insert("0")

    return pynini.union(tens_with_unit, round_tens)

Exemplo n.º 29

0

Exibir arquivo

Arquivo: word.py Projeto: serkhanekarim/marketing-analysis

    def __init__(self, deterministic: bool = True):
        """Build the "word" classifier.

        A word is one or more non-space characters that are not accepted by
        the punctuation graph. In non-deterministic mode, strings made of a
        currency sign ("$", "€", "₩", "£", "¥") followed by digits are
        removed from that set as well. The word graph is kept in
        ``self.graph`` and the ``name: "<word>"`` classifier in ``self.fst``.

        Args:
          deterministic: forwarded to the base class constructor; also
            selects whether currency amounts are excluded.
        """
        super().__init__(name="word",
                         kind="classify",
                         deterministic=deterministic)

        punct = PunctuationFst().graph
        word_char = pynini.difference(NEMO_NOT_SPACE, punct.project("input"))
        word = pynini.closure(word_char, 1)

        if not deterministic:
            currency_amount = (pynini.union("$", "€", "₩", "£", "¥") +
                               pynini.closure(NEMO_DIGIT, 1))
            word = pynini.closure(pynini.difference(word, currency_amount), 1)

        self.graph = word
        tagged = (pynutil.insert("name: \"") + self.graph +
                  pynutil.insert("\""))
        self.fst = tagged.optimize()

Exemplo n.º 30

0

Exibir arquivo

    def __init__(self, cardinal, deterministic: bool = True):
        """Build the "fraction" classifier.

        The accepted input is an optional integer part followed by a space,
        a numerator, "/" or " / ", and a denominator that may carry an
        ordinal suffix ("rd", "th", "st", "nd", lower or upper case), which
        is dropped. The output carries the fields ``integer_part``,
        ``numerator`` and ``denominator``. The untokenized graph is kept in
        ``self.graph`` and the final classifier in ``self.fst``.

        Args:
          cardinal: cardinal grammar providing ``graph``.
          deterministic: forwarded to the base class constructor.
        """
        super().__init__(name="fraction", kind="classify", deterministic=deterministic)
        number = cardinal.graph

        integer_field = pynutil.insert("integer_part: \"") + number + pynutil.insert("\"") + pynini.accep(" ")

        # The slash, with or without surrounding spaces, closes the numerator field.
        slash = pynini.cross("/", "\" ") | pynini.cross(" / ", "\" ")
        numerator_field = pynutil.insert("numerator: \"") + number + slash

        lower_suffixes = ("rd", "th", "st", "nd")
        suffixes = [*lower_suffixes, *(suffix.upper() for suffix in lower_suffixes)]
        optional_suffix = pynini.closure(pynini.cross(pynini.union(*suffixes), ""), 0, 1)

        denominator_field = pynutil.insert("denominator: \"") + number + optional_suffix + pynutil.insert("\"")

        self.graph = pynini.closure(integer_field, 0, 1) + numerator_field + denominator_field
        self.fst = self.add_tokens(self.graph).optimize()