예제 #1
0
파일: cardinal.py 프로젝트: quuhua911/NeMo
    def __init__(self, deterministic: bool = True):
        """Build the verbalizer graphs for the "cardinal" token class.

        Sets `optional_sign`, `integer`, `graph_masc`, `graph_fem` and `fst`
        on the instance.

        Args:
            deterministic: when False, two extra families of alternative
                renderings (apocope variants) are unioned into the final graph.
        """
        super().__init__(name="cardinal", kind="verbalize", deterministic=deterministic)

        # Optional sign field: `negative: "true" ` is rendered as "menos ".
        minus = pynini.cross('negative: "true" ', "menos ")
        self.optional_sign = pynini.closure(minus, 0, 1)

        # Quoted payload: drop the opening ` "` and closing `"`, keeping one or
        # more non-quote symbols in between.
        payload = pynini.closure(NEMO_NOT_QUOTE, 1)
        self.integer = pynutil.delete(' "') + payload + pynutil.delete('"')

        # Full field: the `integer:` key followed by the quoted payload.
        integer_field = pynutil.delete("integer:") + self.integer

        masculine = self.optional_sign + integer_field
        feminine = shift_cardinal_gender(masculine)

        self.graph_masc = pynini.optimize(masculine)
        self.graph_fem = pynini.optimize(feminine)

        # Accept either gender rendering in the combined graph.
        graph = pynini.union(masculine, feminine)

        if not deterministic:
            # Alternate renderings when apocope is omitted (cardinal stands alone).
            graph |= strip_cardinal_apocope(masculine)
            # "una" will drop to "un" in unique contexts.
            graph |= add_cardinal_apocope_fem(feminine)

        self.fst = self.delete_tokens(graph).optimize()
예제 #2
0
def generator_main(exporter: grm.Exporter, token_type: pynini.TokenType):
    """Exports one reading-normalization FST per language in `u.LANGS`.

    For every language, the FST returned by `_open_visual` is composed with the
    FST compiled from that language's `reading_norm.tsv` rule file. The
    optimized result is stored in `exporter` under the upper-cased language
    name.

    Args:
      exporter: Destination for the compiled FSTs, indexed by name.
      token_type: Token type installed as the pynini default while building.
    """
    with pynini.default_token_type(token_type):
        sigma = u.sigma_from_common_data_files()
        for lang in u.LANGS:
            visual_fst = _open_visual(lang, token_type)
            rules_path = u.LANG_DIR / lang / 'reading_norm.tsv'
            reading_fst = rule.fst_from_rule_file(rules_path, sigma)
            composed = visual_fst @ reading_fst
            exporter[lang.upper()] = pynini.optimize(composed)
예제 #3
0
def fst_from_rules(rules: RuleSet, sigma: pynini.Fst) -> pynini.Fst:
    """Builds a single rewrite FST from a set of string rewrite rules.

    Each group yielded by `partition_unordered(rules)` is compiled with
    `pynini.string_map` and optimized. The resulting list is handed, together
    with `sigma`, to `ur.RewriteAndComposeFsts`.

    Args:
      rules: String rules representing a set of rewrites.
      sigma: Fst to consider the complete alphabet for CDRewrites.

    Returns:
      The rewrite FST produced by `ur.RewriteAndComposeFsts`.
    """
    group_fsts = []
    for group in partition_unordered(rules):
        mapping = pynini.string_map(group)
        group_fsts.append(pynini.optimize(mapping))
    return ur.RewriteAndComposeFsts(group_fsts, sigma)
예제 #4
0
def generator_main(exporter: grm.Exporter):
    """Exports the FROM_ARAB and TO_ARAB reversible-romanization FSTs.

    FROM_ARAB is the composition NFC -> visual normalization -> romanization,
    each compiled from a rule file over `byte.BYTE`. TO_ARAB is the closure of
    the inverted romanization string file, with no normalization applied.

    Args:
      exporter: Destination for the compiled FSTs, indexed by name.
    """
    # NFC transducer. It differs from the standalone FST transducer in that it
    # allows letters that are not abjad / alphabet.
    nfc_path = u.LANG_DIR / 'nfc.tsv'
    nfc = rule.fst_from_rule_file(nfc_path, byte.BYTE)

    # Language-agnostic visual normalization.
    visual_norm_path = u.LANG_DIR / 'common' / 'visual_norm.tsv'
    visual_norm = rule.fst_from_rule_file(visual_norm_path, byte.BYTE)

    # Romanization. Towards Latin, NFC and visual normalization run first; the
    # opposite direction needs no visual normalization.
    roman_path = u.LANG_DIR / 'reversible_roman.tsv'
    romanize = rule.fst_from_rule_file(roman_path, byte.BYTE)
    to_latin = nfc @ visual_norm @ romanize
    exporter['FROM_ARAB'] = pynini.optimize(to_latin)

    # Latin to native: invert the raw string mapping and allow any repetition.
    roman_pairs = f.StringFile(roman_path)
    to_native = pynini.invert(roman_pairs).star
    exporter['TO_ARAB'] = to_native.optimize()
예제 #5
0
def Rewrite(rule: pynini.FstLike,
            sigma: pynini.Fst = byte.BYTE,
            left: pynini.FstLike = "",
            right: pynini.FstLike = "") -> pynini.Fst:
    """Compiles `rule` into an optimized context-dependent rewrite FST.

    Args:
      rule: The rewrite to apply.
      sigma: Alphabet FST; its closure is passed to `pynini.cdrewrite`.
      left: Left context passed to `pynini.cdrewrite` (empty by default).
      right: Right context passed to `pynini.cdrewrite` (empty by default).

    Returns:
      The optimized result of `pynini.cdrewrite`.
    """
    sigma_star = sigma.star
    rewrite_fst = pynini.cdrewrite(rule, left, right, sigma_star)
    return pynini.optimize(rewrite_fst)
예제 #6
0
파일: fraction.py 프로젝트: quuhua911/NeMo
    def __init__(self, deterministic: bool = True):
        """Build the verbalizer graphs for the "fraction" token class.

        Sets `graph_masc`, `graph_fem`, `graph` (their union) and `fst`
        (the graph wrapped by `self.delete_tokens` and optimized).

        Args:
            deterministic: when False, extra alternative renderings are
                unioned in (exceptional denominators, unmerged stems and the
                feminine "... parte(s)" forms built in the branches below).
        """
        super().__init__(name="fraction",
                         kind="verbalize",
                         deterministic=deterministic)

        # Derivational strings append 'avo' as a suffix. Adding space for processing aid
        fraction_stem = pynutil.insert(" avo")
        plural = pynutil.insert("s")
        conjunction = pynutil.insert(" y ")

        # Optional whole-number part: unwrap the quoted `integer_part` field.
        integer = (pynutil.delete("integer_part: \"") +
                   strip_cardinal_apocope(pynini.closure(NEMO_NOT_QUOTE)) +
                   pynutil.delete("\""))

        # The numerator is split in two: exactly "un", and anything except "un".
        numerator_one = pynutil.delete("numerator: \"") + pynini.accep(
            "un") + pynutil.delete("\" ")
        numerator = (pynutil.delete("numerator: \"") +
                     pynini.difference(pynini.closure(NEMO_NOT_QUOTE), "un") +
                     pynutil.delete("\" "))

        # Three denominator shapes, selected by the trailing
        # morphosyntactic_features field (or its absence):
        #   add_root -> append the " avo" stem; ordinal -> pass through;
        #   no feature -> plain cardinal.
        denominator_add_stem = pynutil.delete("denominator: \"") + (
            pynini.closure(NEMO_NOT_QUOTE) + fraction_stem +
            pynutil.delete("\" morphosyntactic_features: \"add_root\""))
        denominator_ordinal = pynutil.delete("denominator: \"") + (
            pynini.closure(NEMO_NOT_QUOTE) +
            pynutil.delete("\" morphosyntactic_features: \"ordinal\""))
        denominator_cardinal = pynutil.delete("denominator: \"") + (
            pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\""))

        denominator_singular = pynini.union(denominator_add_stem,
                                            denominator_ordinal)
        if not deterministic:
            # Occasional exceptions
            denominator_singular |= denominator_add_stem @ pynini.string_map(
                [("once avo", "undécimo"), ("doce avo", "duodécimo")])
        denominator_plural = denominator_singular + plural

        # Merging operations
        merge = pynini.cdrewrite(
            pynini.cross(" y ", "i"), "", "", NEMO_SIGMA
        )  # The denominator must be a single word, with the conjunction "y" replaced by i
        merge @= pynini.cdrewrite(delete_space, "",
                                  pynini.difference(NEMO_CHAR, "parte"),
                                  NEMO_SIGMA)

        # The merger can produce duplicate vowels. This is not allowed in orthography
        delete_duplicates = pynini.string_map([("aa", "a"),
                                               ("oo", "o")])  # Removes vowels
        delete_duplicates = pynini.cdrewrite(delete_duplicates, "", "",
                                             NEMO_SIGMA)

        # Apply the `accents` rewrite only inside a word that ends in one of
        # the listed suffixes (avo / ava / ésimo / ésima).
        remove_accents = pynini.cdrewrite(
            accents,
            pynini.union(NEMO_SPACE, pynini.accep("[BOS]")) +
            pynini.closure(NEMO_NOT_SPACE),
            pynini.closure(NEMO_NOT_SPACE) +
            pynini.union("avo", "ava", "ésimo", "ésima"),
            NEMO_SIGMA,
        )
        merge_into_single_word = merge @ remove_accents @ delete_duplicates

        # Default rendering: numerator, single space, merged plural denominator.
        fraction_default = numerator + delete_space + insert_space + (
            denominator_plural @ merge_into_single_word)

        # Numerator "un": merged singular denominator.
        fraction_with_one = (numerator_one + delete_space + insert_space +
                             (denominator_singular @ merge_into_single_word))

        # Featureless denominator: "<numerator> sobre <denominator>".
        fraction_with_cardinal = strip_cardinal_apocope(numerator
                                                        | numerator_one)
        fraction_with_cardinal += (
            delete_space + pynutil.insert(" sobre ") +
            strip_cardinal_apocope(denominator_cardinal))

        if not deterministic:
            # There is an alternative rendering where ordinals act as adjectives for 'parte'. This requires use of the feminine
            # Other rules will manage use of "un" at end, so just worry about endings
            exceptions = pynini.string_map([("tercia", "tercera")])
            apply_exceptions = pynini.cdrewrite(exceptions, "", "", NEMO_SIGMA)
            vowel_change = pynini.cdrewrite(pynini.cross("o", "a"), "",
                                            pynini.accep("[EOS]"), NEMO_SIGMA)

            denominator_singular_fem = shift_cardinal_gender(
                denominator_singular) @ vowel_change @ apply_exceptions
            denominator_plural_fem = denominator_singular_fem + plural

            numerator_one_fem = shift_cardinal_gender(numerator_one)
            numerator_fem = shift_cardinal_gender(numerator)

            fraction_with_cardinal |= (
                (numerator_one_fem | numerator_fem) + delete_space +
                pynutil.insert(" sobre ") +
                shift_cardinal_gender(denominator_cardinal))

            # Still need to manage stems
            merge_stem = pynini.cdrewrite(
                delete_space, "", pynini.union("avo", "ava", "avos", "avas"),
                NEMO_SIGMA)  # For managing alternative spacing
            merge_stem @= remove_accents @ delete_duplicates

            fraction_with_one_fem = numerator_one_fem + delete_space + insert_space
            fraction_with_one_fem += pynini.union(
                denominator_singular_fem @ merge_stem, denominator_singular_fem
                @ merge_into_single_word)  # Both forms exists
            fraction_with_one_fem += pynutil.insert(" parte")
            fraction_with_one_fem @= pynini.cdrewrite(
                pynini.cross("una media", "media"), "", "",
                NEMO_SIGMA)  # "media" not "una media"

            fraction_default_fem = numerator_fem + delete_space + insert_space
            fraction_default_fem += pynini.union(
                denominator_plural_fem @ merge_stem,
                denominator_plural_fem @ merge_into_single_word)
            fraction_default_fem += pynutil.insert(" partes")

            # `@` binds tighter than `+`: only the denominator is composed
            # with merge_stem before being concatenated.
            fraction_default |= (numerator + delete_space + insert_space +
                                 denominator_plural @ merge_stem
                                 )  # Case of no merger
            fraction_default |= fraction_default_fem

            fraction_with_one |= numerator_one + delete_space + insert_space + denominator_singular @ merge_stem
            fraction_with_one |= fraction_with_one_fem

        fraction_with_one @= pynini.cdrewrite(pynini.cross(
            "un medio", "medio"), "", "", NEMO_SIGMA)  # "medio" not "un medio"

        # Masculine graph: optional "<integer> y " prefix, then any fraction form.
        fraction = fraction_with_one | fraction_default | fraction_with_cardinal
        graph_masc = pynini.closure(integer + delete_space + conjunction, 0,
                                    1) + fraction

        # Manage cases of fem gender (only shows on integer except for "medio")
        # NOTE(review): `fraction` / `graph_masc` were assembled above, before the
        # two `|=` additions below; presumably they are meant to reach only
        # `fraction_fem` — confirm against pynini's copy semantics for `|`.
        integer_fem = shift_cardinal_gender(integer)
        fraction_default |= (
            shift_cardinal_gender(numerator) + delete_space + insert_space +
            (denominator_plural @ pynini.cross("medios", "medias")))
        fraction_with_one |= (
            pynutil.delete(numerator_one) + delete_space +
            (denominator_singular @ pynini.cross("medio", "media")))

        fraction_fem = fraction_with_one | fraction_default | fraction_with_cardinal
        graph_fem = pynini.closure(integer_fem + delete_space + conjunction, 0,
                                   1) + fraction_fem

        self.graph_masc = pynini.optimize(graph_masc)
        self.graph_fem = pynini.optimize(graph_fem)

        # Combined graph uses the unoptimized masculine and feminine graphs.
        self.graph = graph_masc | graph_fem

        delete_tokens = self.delete_tokens(self.graph)
        self.fst = delete_tokens.optimize()
예제 #7
0
def brahmic_to_iso(consonant_file: os.PathLike,
                   inherent_vowel_file: os.PathLike,
                   vowel_sign_file: os.PathLike, vowel_file: os.PathLike,
                   vowel_length_sign_file: os.PathLike, coda_file: os.PathLike,
                   dead_consonant_file: os.PathLike,
                   standalone_file: os.PathLike,
                   subjoined_consonant_file: os.PathLike,
                   virama_file: os.PathLike) -> pynini.Fst:
    """Builds the transducer from a Brahmic script to ISO 15919.

    Every `*_file` argument is loaded through `uf.StringFile`. The common
    symbol table is additionally read from `u.SCRIPT_DIR/common/symbol.tsv`.

    Args:
      consonant_file: StringFile with the native--latin consonant mapping.
      inherent_vowel_file: StringFile with the inherent vowel.
      vowel_sign_file: StringFile with the native--latin vowel matra mapping.
      vowel_file: StringFile with the native--latin independent vowel mapping.
      vowel_length_sign_file: StringFile with the native--latin vowel length
        sign mapping.
      coda_file: StringFile with the native--latin coda mapping.
      dead_consonant_file: StringFile with the native--latin dead consonant
        mapping.
      standalone_file: StringFile with the native--latin standalone string
        mapping.
      subjoined_consonant_file: StringFile with the native--latin subjoined
        consonant mapping.
      virama_file: StringFile with the virama for the script.

    Returns:
      The optimized closure over the union of all conversion alternatives.
    """
    load = uf.StringFile
    base_consonant = load(consonant_file)
    inherent = load(inherent_vowel_file)
    matra = load(vowel_sign_file)
    indep_vowel = load(vowel_file)
    length_sign = load(vowel_length_sign_file)
    coda_fst = load(coda_file)
    dead_cons = load(dead_consonant_file)
    standalone_fst = load(standalone_file)
    subjoined = load(subjoined_consonant_file)
    virama_fst = load(virama_file)

    shared_symbols = load(u.SCRIPT_DIR / 'common' / 'symbol.tsv')

    add_inherent = pynutil.insert(inherent)
    add_dash = pynutil.insert('-')
    add_dot = pynutil.insert('.')
    drop_virama = pynutil.delete(virama_fst)
    show_virama = pynini.cross(virama_fst, '˘')

    # Weighted empty string: marks an alternative as lower priority.
    penalty = pynini.accep('', weight=1)
    cons = base_consonant + uf.QuesSafe(subjoined)
    # A dot-separated independent vowel; repeated with .star / .plus below.
    dotted_vowel = add_dot + indep_vowel

    alternatives = [
        cons + matra,
        cons + add_inherent + penalty,
        cons + drop_virama + penalty,
        indep_vowel + penalty,
        coda_fst,
        dead_cons,
        length_sign,
        standalone_fst,

        # Rare cases:
        # Dangling vowel signs.
        add_dash + matra + dotted_vowel.star + penalty,
        show_virama + penalty,  # Explicit virama elsewhere.
        shared_symbols,  # Joiners.

        # Independent vowel not as the first letter:
        indep_vowel + dotted_vowel.plus + penalty,
        cons + matra + dotted_vowel.plus,
        cons + drop_virama + dotted_vowel.plus,
        cons + add_inherent + dotted_vowel.plus,
    ]
    unit = pynini.union(*alternatives)

    return pynini.optimize(unit.star)
# One-to-one letter rewrites, applied everywhere (empty left/right context).
onetoone_34 = pynini.transducer("ⲭ", "ch")
rule_onetoone_34 = pynini.cdrewrite(onetoone_34, "", "", sigma_star)

onetoone_35 = pynini.transducer("ϭ", "ky")
rule_onetoone_35 = pynini.cdrewrite(onetoone_35, "", "", sigma_star)

#cdrewrite

# Compose the rewrite rules, in order, into one optimized transducer.
# The last one-to-one rule is rule_onetoone_35: rule_onetoone_34 used to be
# composed twice here, which left rule_onetoone_35 (ϭ -> "ky") unapplied.
cascade = pynini.optimize(rule_addwb_1@rule_addwb_2@rule_1@rule_2@rule_4@rule_5@
                          rule_6@rule_7@rule_8@rule_9@rule_10@rule_11@rule_12@
                          rule_13@rule_14@rule_18@rule_19@rule_20@rule_21@
                          rule_onetoone_1@rule_onetoone_2@rule_onetoone_3@
                          rule_onetoone_4@rule_onetoone_5@rule_onetoone_6@
                          rule_onetoone_7@rule_onetoone_8@rule_onetoone_9@
                          rule_onetoone_10@rule_onetoone_11@rule_onetoone_12@
                          rule_onetoone_13@rule_onetoone_14@rule_onetoone_15@
                          rule_onetoone_16@rule_onetoone_17@rule_onetoone_18@
                          rule_onetoone_19@rule_onetoone_20@rule_onetoone_21@
                          rule_onetoone_22@rule_onetoone_23@rule_onetoone_24@
                          rule_onetoone_25@rule_onetoone_26@rule_onetoone_27@
                          rule_onetoone_28@rule_onetoone_29@rule_onetoone_30@
                          rule_onetoone_31@rule_onetoone_32@rule_onetoone_33@
                          rule_onetoone_34@rule_onetoone_35@rule_removewb)


def translit(text):
    """Lower-cases `text`, composes it with `cascade`, and returns the result's string."""
    lowered = text.lower()
    rewritten = lowered @ cascade
    return rewritten.string()

# Unpack the argument vector into exactly two names; any other length raises
# ValueError at this line.
# NOTE(review): `argv` is presumably `sys.argv`, imported earlier in the file — confirm.
script, filename = argv
                                 sigma.star)

# clean up rules

# Context-free rewrite over sigma.star built from the union of inflClass,
# category and feat.
# NOTE(review): `T(x, "")` presumably builds a transducer mapping x to the
# empty string (i.e. deleting those symbols) — confirm against T's definition.
cleanUp = pynini.cdrewrite(T(inflClass | category | feat, ""), "", "",
                           sigma.star)

# double plurals in Breton

# Lexicon composed with the number rule, then the diminutive rule.
newLex = (lexicon @ numRule @ dimRule).optimize()
# Realization rules (diminutive, then plural) followed by the clean-up rewrite.
grammar = (dimRealization @ plRealization @ cleanUp).optimize()
# Every lexicon entry pushed through the whole grammar.
lexForms = (newLex @ grammar).optimize()

# Checking the program

# NOTE(review): the value of this expression is discarded, so in a script it
# only confirms that the projection and optimization run without raising.
pynini.optimize(pynini.project(lexForms, True))


def realizeNoun(m):
    """Composes `m` with `grammar` and returns the projected result as a UTF-8 string."""
    realized = (m @ grammar).optimize()
    surface = pynini.project(realized, True)
    return surface.stringify(token_type="utf8")


# Separator printed between each label and its realized form.
arrow = "\t-->\t"  # \t is the 'tab' character

# Demo: realize the lexicon with each combination of number (sg / pl) and the
# optional diminutive feature, one line per combination. The two short labels
# carry an extra trailing tab.
print("bag[sg]\t" + arrow + realizeNoun(lexicon + sg))
print("bag[pl]\t" + arrow + realizeNoun(lexicon + pl))
print("bag[sg,dim]" + arrow + realizeNoun(lexicon + sg + dim))
print("bag[pl,dim]" + arrow + realizeNoun(lexicon + pl + dim))