Пример #1
0
    def __init__(self, deterministic: bool = True):
        """
        Assemble the punctuation tagger.

        Sets ``self.punct_marks`` (punctuation characters that are not listed in
        the symbol whitelist), ``self.graph`` (the acceptor) and ``self.fst``
        (the acceptor wrapped in ``name: "..."``).

        Args:
            deterministic: forwarded unchanged to the parent constructor.
        """
        super().__init__(name="punctuation",
                         kind="classify",
                         deterministic=deterministic)
        ascii_marks = "!#%&\'()*+,-./:;<=>?@^_`{|}~\""
        bracket_symbols = ["[", "]"]

        # Every code point whose Unicode category starts with "P", minus the
        # square brackets.
        unicode_marks = []
        for code_point in range(sys.maxunicode):
            symbol = chr(code_point)
            if not category(symbol).startswith("P"):
                continue
            if symbol in bracket_symbols:
                continue
            unicode_marks.append(symbol)

        # The first column of the whitelist file lists symbols that must not be
        # treated as punctuation here.
        whitelist_rows = load_labels(get_abs_path("data/whitelist/symbol.tsv"))
        whitelisted = [row[0] for row in whitelist_rows]
        self.punct_marks = [
            mark for mark in unicode_marks + list(ascii_marks)
            if mark not in whitelisted
        ]

        # One or more consecutive punctuation marks.
        mark_run = pynini.closure(pynini.union(*self.punct_marks), 1)

        # "<" ... ">" spans: either `<body` with an optional trailing "/", or
        # `</body`; the body is a non-empty run without spaces or angle brackets.
        tag_body = pynini.closure(NEMO_NOT_SPACE - pynini.union("<", ">"), 1)
        body_then_optional_slash = tag_body + pynini.closure(
            pynini.accep("/"), 0, 1)
        slash_then_body = pynini.accep("/") + tag_body
        emphasis = (pynini.accep("<") +
                    (body_then_optional_slash | slash_then_body) +
                    pynini.accep(">"))

        # The tag pattern takes priority over the plain punctuation run.
        self.graph = plurals._priority_union(emphasis, mark_run, NEMO_SIGMA)

        quoted = pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")
        self.fst = quoted.optimize()
Пример #2
0
    def __init__(self, deterministic: bool = True):
        """
        Assemble the word tagger.

        A word is a non-empty run of non-space characters that contains no
        digit and none of the symbols "$", "€", "₩", "£", "¥", "#", "%".
        Bracketed phone sequences such as ``[HH AH0 L OW1]`` take priority and
        are passed through ``convert_space``.

        Args:
            deterministic: when False, one optional space is also accepted
                right after "[" and right before "]".
        """
        super().__init__(name="word", kind="classify", deterministic=deterministic)

        excluded_symbols = pynini.union("$", "€", "₩", "£", "¥", "#", "%")
        symbols_to_exclude = (excluded_symbols | NEMO_DIGIT).optimize()
        allowed_char = pynini.difference(NEMO_NOT_SPACE, symbols_to_exclude)
        plain_word = pynini.closure(allowed_char, 1)

        # leave phones of format [HH AH0 L OW1] untouched
        phoneme_unit = pynini.closure(NEMO_ALPHA, 1) + pynini.closure(NEMO_DIGIT)
        left_bracket = pynini.accep(pynini.escape("["))
        right_bracket = pynini.accep(pynini.escape("]"))
        spaced_units = pynini.closure(phoneme_unit + pynini.accep(" ")) + phoneme_unit

        if deterministic:
            phoneme = left_bracket + spaced_units + right_bracket
        else:
            maybe_space = pynini.closure(pynini.accep(" "), 0, 1)
            phoneme = left_bracket + maybe_space + spaced_units + maybe_space + right_bracket

        self.graph = plurals._priority_union(convert_space(phoneme), plain_word, NEMO_SIGMA)
        wrapped = pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")
        self.fst = wrapped.optimize()
Пример #3
0
    def __init__(self, deterministic: bool = True):
        """
        Build the verbalizer graph for "electronic" tokens.

        The graph consumes, in order, an optional ``protocol: "..."`` field, an
        optional ``username: "..."`` field (followed by an inserted " at ") and
        a ``domain: "..."`` field, and emits their spoken form.

        Args:
            deterministic: when False, "0" may also be verbalized as "o" or
                "oh" in addition to "zero".
        """
        super().__init__(name="electronic", kind="verbalize", deterministic=deterministic)
        # digit.tsv is inverted so that the written digit is on the input side.
        graph_digit_no_zero = pynini.invert(pynini.string_file(get_abs_path("data/number/digit.tsv"))).optimize()
        graph_zero = pynini.cross("0", "zero")

        if not deterministic:
            graph_zero |= pynini.cross("0", "o") | pynini.cross("0", "oh")

        graph_digit = graph_digit_no_zero | graph_zero
        graph_symbols = pynini.string_file(get_abs_path("data/electronic/symbol.tsv")).optimize()

        # Rewrite every symbol/digit anywhere in the string into its spoken
        # form, padded with a space on both sides.
        default_chars_symbols = pynini.cdrewrite(
            pynutil.insert(" ") + (graph_symbols | graph_digit) + pynutil.insert(" "), "", "", NEMO_SIGMA
        )

        # The original line was garbled (`"username:"******"\""`), which is not
        # valid Python. The field is parsed the same way as the `domain:` field
        # below: delete the key, optional whitespace and the opening quote.
        user_name = (
            pynutil.delete("username:")
            + delete_space
            + pynutil.delete("\"")
            + default_chars_symbols
            + pynutil.delete("\"")
        )

        domain_common = pynini.string_file(get_abs_path("data/electronic/domain.tsv"))

        # <expanded chars> + (known domain ending, else "." -> "dot")
        # + optional remainder run through the TO_UPPER rewrite before expansion.
        domain = (
            default_chars_symbols
            + insert_space
            + plurals._priority_union(
                domain_common, pynutil.add_weight(pynini.cross(".", "dot"), weight=0.0001), NEMO_SIGMA
            )
            + pynini.closure(
                insert_space + (pynini.cdrewrite(TO_UPPER, "", "", NEMO_SIGMA) @ default_chars_symbols), 0, 1
            )
        )
        domain = (
            pynutil.delete("domain:")
            + delete_space
            + pynutil.delete("\"")
            + domain
            + delete_space
            + pynutil.delete("\"")
        ).optimize()

        # The protocol value is copied through verbatim (anything but a quote).
        protocol = pynutil.delete("protocol: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
        # Final pass collapses the runs of spaces introduced by the insertions above.
        graph = (
            pynini.closure(protocol + delete_space, 0, 1)
            + pynini.closure(user_name + delete_space + pynutil.insert(" at ") + delete_space, 0, 1)
            + domain
            + delete_space
        ).optimize() @ pynini.cdrewrite(delete_extra_space, "", "", NEMO_SIGMA)

        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
Пример #4
0
def get_four_digit_year_graph(deterministic: bool = True):
    """
    Build a transducer that reads a four digit number as two pairs
    (ties/teens or "hundred") rather than in the "thousand" format, e.g.
    1219 -> twelve nineteen
    3900 -> thirty nine hundred

    Forms with a trailing "s" are accepted as well and pluralized on output;
    "thousand" readings are added for inputs matching the N00D / N000s shapes.

    Args:
        deterministic: passed to ``get_ties_graph``; when True the "thousand"
            reading takes priority over the pair reading, otherwise both are
            kept as alternatives.

    Returns:
        The optimized year graph.
    """
    ties = get_ties_graph(deterministic)
    ties_pair = ties + insert_space + ties
    hundreds = (graph_teen | ties) + insert_space + pynini.cross("00", "hundred")

    # Plural forms: the written trailing "0s" / "s" is removed here and the
    # plural ending is re-created on the spoken side by the rewrite below.
    teen_then_decade = graph_teen + insert_space + (ties_graph | pynini.cross("1", "ten"))
    plural_forms = (ties_pair | teen_then_decade) + pynutil.delete("0s")
    plural_forms = plural_forms | (hundreds + pynutil.delete("s"))
    pluralize_last_word = pynini.cdrewrite(
        pynini.cross("y", "ies") | pynutil.insert("s"), "", "[EOS]", NEMO_SIGMA
    )
    plural_forms = plural_forms @ pluralize_last_word

    # "thousand" readings: N00D (D == 0 is dropped) and N000 followed by "s".
    last_digit = pynutil.delete("0") | insert_space + graph_digit
    thousand_graph = graph_digit + insert_space + pynini.cross("00", "thousand") + last_digit
    thousand_plural = (
        graph_digit
        + insert_space
        + pynini.cross("000", "thousand")
        + pynini.closure(pynutil.delete(" "), 0, 1)
        + pynini.accep("s")
    )
    thousand_graph = thousand_graph | thousand_plural

    year_graph = ties_pair | hundreds
    year_graph = year_graph | plural_forms
    if deterministic:
        year_graph = plurals._priority_union(thousand_graph, year_graph, NEMO_SIGMA)
    else:
        year_graph = year_graph | thousand_graph

    return year_graph.optimize()
Пример #5
0
def singular_to_plural():
    """
    Build the singular -> plural transducer from suffix rules.

    Entries of ``suppletive`` take priority; any other input is handled by the
    union of the suffix rules below (several rules may apply to one word).

    Returns:
        The optimized pluralization graph.
    """

    def _ending_rule(endings, addition):
        # any prefix + one of `endings`, followed by the inserted plural suffix
        return NEMO_SIGMA + pynini.union(*endings) + addition

    # plural ending n/en: masculine nouns ending in e, ent, and, ant, ist, or
    rule_n = _ending_rule(["e"], pynutil.insert("n"))
    rule_en = _ending_rule(
        ["ent", "and", "ant", "ist", "or", "ion", "ik", "heit", "keit", "schaft", "tät", "ung"],
        pynutil.insert("en"),
    )
    rule_nen = _ending_rule(["in"], pynutil.insert("e") | pynutil.insert("nen"))
    rule_foreign = _ending_rule(["ma", "um", "us"], pynutil.insert("en"))
    # masculine nouns ending in eur, ich, ier, ig, ling, ör
    rule_e = _ending_rule(["eur", "ich", "ier", "ig", "ling", "ör"], pynutil.insert("e"))
    rule_s = _ending_rule(["a", "i", "o", "u", "y"], pynutil.insert("s"))

    regular_rules = pynini.union(rule_n, rule_en, rule_nen, rule_foreign, rule_e, rule_s)
    return plurals._priority_union(suppletive, regular_rules, NEMO_SIGMA).optimize()
Пример #6
0
    def __init__(self, deterministic: bool = True):
        """
        Assemble the punctuation tagger.

        ``self.graph`` accepts either a "<...>" tag-like span or a single
        punctuation character; ``self.fst`` wraps it in ``name: "..."``.

        Args:
            deterministic: forwarded unchanged to the parent constructor.
        """
        super().__init__(name="punctuation", kind="classify", deterministic=deterministic)
        ascii_marks = "!#%&\'()*+,-./:;<=>?@^_`{|}~\""

        # Every code point whose Unicode category starts with "P", minus "[" and "]".
        unicode_marks = []
        for code_point in range(sys.maxunicode):
            symbol = chr(code_point)
            if category(symbol).startswith("P") and symbol not in "[]":
                unicode_marks.append(symbol)

        ascii_union = pynini.union(*ascii_marks)
        unicode_union = pynini.union(*unicode_marks)
        single_mark = ascii_union | unicode_union

        # "<" ... ">" spans: either `<body` with an optional trailing "/", or
        # `</body`; the body is a non-empty run without spaces or angle brackets.
        tag_body = pynini.closure(NEMO_NOT_SPACE - pynini.union("<", ">"), 1)
        body_then_optional_slash = tag_body + pynini.closure(pynini.accep("/"), 0, 1)
        slash_then_body = pynini.accep("/") + tag_body
        emphasis = pynini.accep("<") + (body_then_optional_slash | slash_then_body) + pynini.accep(">")

        # The tag pattern takes priority over the single punctuation mark.
        self.graph = plurals._priority_union(emphasis, single_mark, NEMO_SIGMA)
        wrapped = pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")
        self.fst = wrapped.optimize()
Пример #7
0
    def __init__(self, deterministic: bool = True):
        """
        Assemble the word tagger.

        A word is a non-empty run of non-space characters that are not accepted
        as input by ``PunctuationFst().graph``. Bracketed phone sequences such as
        ``[HH AH0 L OW1]`` take priority and are passed through ``convert_space``.

        Args:
            deterministic: when False, strings made of a currency/"#"/"%" symbol
                followed by digits are additionally removed from the word graph.
        """
        super().__init__(name="word", kind="classify", deterministic=deterministic)

        punct = PunctuationFst().graph
        non_punct_char = pynini.difference(NEMO_NOT_SPACE, punct.project("input"))
        word = pynini.closure(non_punct_char, 1)

        if not deterministic:
            # NOTE: "$" appears twice in this union, exactly as in the original
            # grammar; the duplicate has no effect on the accepted set.
            symbol_then_digits = pynini.union("$", "€", "₩", "£", "¥", "#", "$", "%") + pynini.closure(
                NEMO_DIGIT, 1
            )
            word = pynini.closure(pynini.difference(word, symbol_then_digits), 1)

        # leave phones of format [HH AH0 L OW1] untouched
        phoneme_unit = pynini.closure(NEMO_ALPHA, 1) + pynini.closure(NEMO_DIGIT)
        left_bracket = pynini.accep(pynini.escape("["))
        right_bracket = pynini.accep(pynini.escape("]"))
        phoneme = (
            left_bracket + pynini.closure(phoneme_unit + pynini.accep(" ")) + phoneme_unit + right_bracket
        )

        self.graph = plurals._priority_union(convert_space(phoneme), word, NEMO_SIGMA)
        wrapped = pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")
        self.fst = wrapped.optimize()
Пример #8
0
def _get_two_digit_year(cardinal_graph, single_digits_graph):
    """
    Restrict input to exactly two digits and read it with ``cardinal_graph``,
    falling back to ``single_digits_graph`` for inputs the former rejects.
    """
    exactly_two_digits = NEMO_DIGIT ** 2
    preferred_reading = plurals._priority_union(cardinal_graph, single_digits_graph, NEMO_SIGMA)
    return exactly_two_digits @ preferred_reading
Пример #9
0
    # Whitespace helpers: drop any run of whitespace, insert a single space,
    # and collapse one-or-more whitespace characters into a single space.
    delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE))
    insert_space = pynutil.insert(" ")
    delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")

    # Irregular singular/plural pairs are read from a data file.
    suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
    # _v = pynini.union("a", "e", "i", "o", "u")
    # Consonant letters (used by the "-y" -> "-ies" rule below).
    _c = pynini.union("b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n",
                      "p", "q", "r", "s", "t", "v", "w", "x", "y", "z")
    # consonant + "y" -> consonant + "ies"
    _ies = NEMO_SIGMA + _c + pynini.cross("y", "ies")
    # endings s / sh / ch / x / z take "es"
    _es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x",
                                    "z") + pynutil.insert("es")
    # default rule: append "s"
    _s = NEMO_SIGMA + pynutil.insert("s")

    # Rules are tried in priority order: suppletive, then _ies, then _es,
    # then the default _s.
    graph_plural = plurals._priority_union(
        suppletive,
        plurals._priority_union(_ies,
                                plurals._priority_union(_es, _s, NEMO_SIGMA),
                                NEMO_SIGMA), NEMO_SIGMA).optimize()

    SINGULAR_TO_PLURAL = graph_plural
    PLURAL_TO_SINGULAR = pynini.invert(graph_plural)
    # ASCII case mapping, letter by letter (A->a ... Z->z) and its inverse.
    TO_LOWER = pynini.union(*[
        pynini.cross(x, y)
        for x, y in zip(string.ascii_uppercase, string.ascii_lowercase)
    ])
    TO_UPPER = pynini.invert(TO_LOWER)

    # Reached only if every statement above ran without raising.
    PYNINI_AVAILABLE = True
except (ModuleNotFoundError, ImportError):
    # Create placeholders
    NEMO_CHAR = None
Пример #10
0
    # plural ending n/en: masculine nouns ending in e, ent, and, ant, ist, or
    _n = NEMO_SIGMA + pynini.union("e") + pynutil.insert("n")
    _en = (NEMO_SIGMA +
           pynini.union("ent", "and", "ant", "ist", "or", "ion", "ik", "heit",
                        "keit", "schaft", "tät", "ung") + pynutil.insert("en"))
    # words ending in "in": either "e" or "nen" is appended (both are kept)
    _nen = NEMO_SIGMA + pynini.union("in") + (pynutil.insert("e")
                                              | pynutil.insert("nen"))
    # endings ma / um / us get "en" appended
    _fremd = NEMO_SIGMA + pynini.union("ma", "um", "us") + pynutil.insert("en")
    # masculine nouns ending in eur, ich, ier, ig, ling, ör
    _e = NEMO_SIGMA + pynini.union("eur", "ich", "ier", "ig", "ling",
                                   "ör") + pynutil.insert("e")
    # words ending in a / i / o / u / y take "s"
    _s = NEMO_SIGMA + pynini.union("a", "i", "o", "u",
                                   "y") + pynutil.insert("s")

    # Suppletive entries win; otherwise any of the suffix rules may apply
    # (plain union, so a word can receive more than one plural).
    graph_plural = plurals._priority_union(
        suppletive, pynini.union(_n, _en, _nen, _fremd, _e, _s),
        NEMO_SIGMA).optimize()

    SINGULAR_TO_PLURAL = graph_plural
    PLURAL_TO_SINGULAR = pynini.invert(graph_plural)
    # ASCII case mapping, letter by letter (A->a ... Z->z) and its inverse.
    TO_LOWER = pynini.union(*[
        pynini.cross(x, y)
        for x, y in zip(string.ascii_uppercase, string.ascii_lowercase)
    ])
    TO_UPPER = pynini.invert(TO_LOWER)

    # Reached only if every statement above ran without raising.
    PYNINI_AVAILABLE = True
except (ModuleNotFoundError, ImportError):
    # Create placeholders
    NEMO_CHAR = None
Пример #11
0
    def __init__(self,
                 cardinal: GraphFst,
                 ordinal: GraphFst,
                 deterministic: bool = True,
                 lm: bool = False):
        """
        Build the serial-number tagger; the result is exposed as ``self.graph``
        and, wrapped in ``name: "..."``, as ``self.fst``.

        Args:
            cardinal: tagger providing ``graph``, ``single_digits_graph`` and
                ``graph_hundred_component_at_least_one_none_zero_digit``.
            ordinal: tagger whose ``graph`` inputs are excluded from serials.
            deterministic: forwarded to the parent constructor.
            lm: when True, the extra non-deterministic number readings are
                not added even if ``deterministic`` is False.
        """
        super().__init__(name="integer",
                         kind="classify",
                         deterministic=deterministic)
        # NOTE: the string literal below follows a statement, so Python does not
        # treat it as the docstring; it is kept as an inline description.
        """
        Finite state transducer for classifying serial (handles only cases without delimiters,
        values with delimiters are handled by default).
            The serial is a combination of digits, letters and dashes, e.g.:
            c325b -> tokens { cardinal { integer: "c three two five b" } }
        """
        # 6+ digits are read digit by digit, 1-5 digits as a cardinal number.
        num_graph = pynini.compose(NEMO_DIGIT**(6, ...),
                                   cardinal.single_digits_graph).optimize()
        num_graph |= pynini.compose(NEMO_DIGIT**(1, 5),
                                    cardinal.graph).optimize()
        # to handle numbers starting with zero
        num_graph |= pynini.compose(
            pynini.accep("0") + pynini.closure(NEMO_DIGIT),
            cardinal.single_digits_graph).optimize()
        # TODO: "#" doesn't work from the file
        symbols_graph = pynini.string_file(
            get_abs_path("data/whitelist/symbol.tsv")).optimize(
            ) | pynini.cross("#", "hash")
        num_graph |= symbols_graph

        if not self.deterministic and not lm:
            num_graph |= cardinal.single_digits_graph
            # also allow double digits to be pronounced as integer in serial number
            num_graph |= pynutil.add_weight(
                NEMO_DIGIT**2 @ cardinal.
                graph_hundred_component_at_least_one_none_zero_digit,
                weight=0.0001)

        # add space between letter and digit/symbol
        symbols = [
            x[0]
            for x in load_labels(get_abs_path("data/whitelist/symbol.tsv"))
        ]
        symbols = pynini.union(*symbols)
        digit_symbol = NEMO_DIGIT | symbols

        # Two rewrites composed: insert a space where a letter/symbol is followed
        # by a digit/symbol, then where a digit/symbol is followed by a
        # letter/symbol.
        graph_with_space = pynini.compose(
            pynini.cdrewrite(pynutil.insert(" "), NEMO_ALPHA | symbols,
                             digit_symbol, NEMO_SIGMA),
            pynini.cdrewrite(pynutil.insert(" "), digit_symbol,
                             NEMO_ALPHA | symbols, NEMO_SIGMA),
        )

        # serial graph with delimiter
        delimiter = pynini.accep("-") | pynini.accep("/") | pynini.accep(" ")
        alphas = pynini.closure(NEMO_ALPHA, 1)
        letter_num = alphas + delimiter + num_graph
        num_letter = pynini.closure(num_graph + delimiter, 1) + alphas
        next_alpha_or_num = pynini.closure(delimiter + (alphas | num_graph))
        # alternative continuation: number directly followed by letters, with a
        # space kept if present and inserted otherwise
        next_alpha_or_num |= pynini.closure(
            delimiter + num_graph +
            plurals._priority_union(pynini.accep(" "), pynutil.insert(" "),
                                    NEMO_SIGMA).optimize() + alphas)

        serial_graph = letter_num + next_alpha_or_num
        serial_graph |= num_letter + next_alpha_or_num
        # numbers only with 2+ delimiters
        serial_graph |= (num_graph + delimiter + num_graph + delimiter +
                         num_graph + pynini.closure(delimiter + num_graph))
        # 2+ symbols
        serial_graph |= pynini.compose(NEMO_SIGMA + symbols + NEMO_SIGMA,
                                       num_graph + delimiter + num_graph)

        # exclude ordinal numbers from serial options
        serial_graph = pynini.compose(
            pynini.difference(NEMO_SIGMA,
                              pynini.project(ordinal.graph, "input")),
            serial_graph).optimize()

        serial_graph = pynutil.add_weight(serial_graph, 0.0001)
        # unweighted alternative: any non-space run followed by "^2" / "^3"
        serial_graph |= (pynini.closure(NEMO_NOT_SPACE, 1) +
                         (pynini.cross("^2", " squared")
                          | pynini.cross("^3", " cubed")).optimize())

        # at least one serial graph with alpha numeric value and optional additional serial/num/alpha values
        serial_graph = (
            pynini.closure((serial_graph | num_graph | alphas) + delimiter) +
            serial_graph + pynini.closure(delimiter +
                                          (serial_graph | num_graph | alphas)))

        # Also accept inputs without delimiters by first inserting the
        # letter/digit boundary spaces, then require at least two leading
        # non-space characters on the input side.
        serial_graph |= pynini.compose(graph_with_space,
                                       serial_graph.optimize()).optimize()
        serial_graph = pynini.compose(pynini.closure(NEMO_NOT_SPACE, 2),
                                      serial_graph).optimize()

        self.graph = serial_graph.optimize()
        graph = pynutil.insert("name: \"") + convert_space(
            self.graph).optimize() + pynutil.insert("\"")
        self.fst = graph.optimize()
Пример #12
0
    def __init__(
        self,
        input_case: str,
        deterministic: bool = True,
        cache_dir: str = None,
        overwrite_cache: bool = True,
        whitelist: str = None,
    ):
        """
        Build (or restore from a .far cache) the combined classify-and-verbalize
        graph. The result is stored in ``self.fst``; ``self.fst_no_digits`` is
        the same graph restricted to outputs that contain no digit.

        Args:
            input_case: forwarded to ``WhiteListFst`` and used in the cache file name.
            deterministic: forwarded to the parent constructor and to the word,
                serial, roman, range, date/money verbalizer grammars; most
                other grammars are built with a fixed setting (see below).
            cache_dir: directory for the .far cache; ``None`` or the string
                ``'None'`` disables caching.
            overwrite_cache: when False and the cache file exists, the graph is
                loaded from it instead of being rebuilt.
            whitelist: path to a whitelist file, forwarded to ``WhiteListFst``;
                its base name becomes part of the cache file name.
        """
        super().__init__(name="tokenize_and_classify",
                         kind="classify",
                         deterministic=deterministic)

        far_file = None
        if cache_dir is not None and cache_dir != 'None':
            os.makedirs(cache_dir, exist_ok=True)
            whitelist_file = os.path.basename(whitelist) if whitelist else ""
            far_file = os.path.join(
                cache_dir,
                f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}_lm.far"
            )
        if not overwrite_cache and far_file and os.path.exists(far_file):
            # Cache hit: load the stored graph and derive the digit-free variant.
            self.fst = pynini.Far(far_file, mode='r')['tokenize_and_classify']
            no_digits = pynini.closure(pynini.difference(
                NEMO_CHAR, NEMO_DIGIT))
            self.fst_no_digits = pynini.compose(self.fst, no_digits).optimize()
            logging.info(f'ClassifyFst.fst was restored from {far_file}.')
        else:
            logging.info(
                f'Creating ClassifyFst grammars. This might take some time...')
            # TAGGERS
            # Most taggers use deterministic=True; money and whitelist use
            # deterministic=False; word and serial follow the caller's flag.
            cardinal = CardinalFst(deterministic=True, lm=True)
            # keep a handle on the tagger: `cardinal` is rebound to the
            # verbalizer further down, but RangeFst needs the tagger
            cardinal_tagger = cardinal
            cardinal_graph = cardinal.fst

            ordinal = OrdinalFst(cardinal=cardinal, deterministic=True)
            ordinal_graph = ordinal.fst

            decimal = DecimalFst(cardinal=cardinal, deterministic=True)
            decimal_graph = decimal.fst
            fraction = FractionFst(deterministic=True, cardinal=cardinal)
            fraction_graph = fraction.fst

            measure = MeasureFst(cardinal=cardinal,
                                 decimal=decimal,
                                 fraction=fraction,
                                 deterministic=True)
            measure_graph = measure.fst
            date = DateFst(cardinal=cardinal, deterministic=True, lm=True)
            date_graph = date.fst
            word_graph = WordFst(deterministic=deterministic).graph
            time_graph = TimeFst(cardinal=cardinal, deterministic=True).fst
            telephone_graph = TelephoneFst(deterministic=True).fst
            electronic_graph = ElectronicFst(deterministic=True).fst
            money_graph = MoneyFst(cardinal=cardinal,
                                   decimal=decimal,
                                   deterministic=False).fst
            # from here on `whitelist` is the grammar object, not the path
            whitelist = WhiteListFst(input_case=input_case,
                                     deterministic=False,
                                     input_file=whitelist)
            whitelist_graph = whitelist.graph
            punct_graph = PunctuationFst(deterministic=True).graph
            serial_graph = SerialFst(cardinal=cardinal,
                                     ordinal=ordinal,
                                     deterministic=deterministic,
                                     lm=True).fst

            # VERBALIZERS
            # NOTE: cardinal / decimal / ordinal / fraction / measure are
            # rebound to verbalizer objects below.
            cardinal = vCardinal(deterministic=True)
            v_cardinal_graph = cardinal.fst
            decimal = vDecimal(cardinal=cardinal, deterministic=True)
            v_decimal_graph = decimal.fst
            ordinal = vOrdinal(deterministic=True)
            v_ordinal_graph = ordinal.fst
            fraction = vFraction(deterministic=True, lm=True)
            v_fraction_graph = fraction.fst
            v_telephone_graph = vTelephone(deterministic=True).fst
            v_electronic_graph = vElectronic(deterministic=True).fst
            measure = vMeasure(decimal=decimal,
                               cardinal=cardinal,
                               fraction=fraction,
                               deterministic=False)
            v_measure_graph = measure.fst
            v_time_graph = vTime(deterministic=True).fst
            v_date_graph = vDate(ordinal=ordinal,
                                 deterministic=deterministic,
                                 lm=True).fst
            v_money_graph = vMoney(decimal=decimal,
                                   deterministic=deterministic).fst
            v_roman_graph = vRoman(deterministic=deterministic).fst

            time_final = pynini.compose(time_graph, v_time_graph)

            # Date tagging takes priority over cardinal tagging; the result is
            # verbalized by either the cardinal or the date verbalizer.
            cardinal_or_date_final = plurals._priority_union(
                date_graph, cardinal_graph, NEMO_SIGMA)
            cardinal_or_date_final = pynini.compose(
                cardinal_or_date_final, (v_cardinal_graph | v_date_graph))

            # Path weights: semiotic classes, plain words, punctuation.
            sem_w = 1
            word_w = 100
            punct_w = 2
            # Each tagger is composed with its verbalizer and weighted.
            classify_and_verbalize = (
                pynutil.add_weight(time_final, sem_w)
                | pynutil.add_weight(
                    pynini.compose(decimal_graph, v_decimal_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(measure_graph, v_measure_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(ordinal_graph, v_ordinal_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(telephone_graph, v_telephone_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(electronic_graph, v_electronic_graph),
                    sem_w)
                | pynutil.add_weight(
                    pynini.compose(fraction_graph, v_fraction_graph), sem_w)
                | pynutil.add_weight(
                    pynini.compose(money_graph, v_money_graph), sem_w)
                | pynutil.add_weight(cardinal_or_date_final, sem_w)
                | pynutil.add_weight(whitelist_graph, sem_w)
                | pynutil.add_weight(
                    pynini.compose(serial_graph, v_cardinal_graph),
                    1.1001)  # should be higher than the rest of the classes
            ).optimize()

            roman_graph = RomanFst(deterministic=deterministic, lm=True).fst
            # the weight matches the word_graph weight for "I" cases in long sentences with multiple semiotic tokens
            classify_and_verbalize |= pynutil.add_weight(
                pynini.compose(roman_graph, v_roman_graph), sem_w)

            date_final = pynini.compose(date_graph, v_date_graph)
            range_graph = RangeFst(time=time_final,
                                   cardinal=cardinal_tagger,
                                   date=date_final,
                                   deterministic=deterministic).fst
            v_word_graph = vWord(deterministic=deterministic).fst
            classify_and_verbalize |= pynutil.add_weight(
                pynini.compose(range_graph, v_word_graph), sem_w)
            # Semiotic outputs are wrapped in "< " ... " >"; plain words are
            # added afterwards and therefore are not wrapped.
            classify_and_verbalize = pynutil.insert(
                "< ") + classify_and_verbalize + pynutil.insert(" >")
            classify_and_verbalize |= pynutil.add_weight(word_graph, word_w)

            punct_only = pynutil.add_weight(punct_graph, weight=punct_w)
            # One or more items, each either a whitespace run collapsed to one
            # space or a punctuation span preceded by an inserted space.
            punct = pynini.closure(
                pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1),
                               delete_extra_space)
                | (pynutil.insert(" ") + punct_only),
                1,
            )

            def get_token_sem_graph(classify_and_verbalize):
                """
                Turn the single-token graph into a sentence-level graph:
                tokens with optional surrounding punctuation, separated by
                whitespace or punctuation, with surplus spaces removed.
                """
                token_plus_punct = (
                    pynini.closure(punct + pynutil.insert(" ")) +
                    classify_and_verbalize +
                    pynini.closure(pynutil.insert(" ") + punct))

                graph = token_plus_punct + pynini.closure(
                    (pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1),
                                    delete_extra_space)
                     | (pynutil.insert(" ") + punct + pynutil.insert(" "))) +
                    token_plus_punct)

                # input consisting of punctuation only
                graph |= punct_only + pynini.closure(punct)
                graph = delete_space + graph + delete_space

                # Output-side clean-up: collapse whitespace runs between
                # non-space chunks, optionally dropping leading spaces.
                remove_extra_spaces = pynini.closure(
                    NEMO_NOT_SPACE,
                    1) + pynini.closure(delete_extra_space +
                                        pynini.closure(NEMO_NOT_SPACE, 1))
                remove_extra_spaces |= (
                    pynini.closure(pynutil.delete(" "), 1) +
                    pynini.closure(NEMO_NOT_SPACE, 1) +
                    pynini.closure(delete_extra_space +
                                   pynini.closure(NEMO_NOT_SPACE, 1)))

                graph = pynini.compose(graph.optimize(),
                                       remove_extra_spaces).optimize()
                return graph

            self.fst = get_token_sem_graph(classify_and_verbalize)
            # Variant whose outputs may not contain any digit character.
            no_digits = pynini.closure(pynini.difference(
                NEMO_CHAR, NEMO_DIGIT))
            self.fst_no_digits = pynini.compose(self.fst, no_digits).optimize()

            # Persist the freshly built graph when caching is enabled.
            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f'ClassifyFst grammars are saved to {far_file}.')
Пример #13
0
    def get_address_graph(self, cardinal):
        """
        Build the transducer that reads a postal address, e.g.:
            2788 San Tomas Expy, Santa Clara, CA 95051 ->
                units: "address" cardinal
                { integer: "two seven eight eight San Tomas Expressway Santa Clara California nine five zero five one" }
                 preserve_order: true

        The graph is: house number, optional direction letter, street words,
        then either an optional ", city, state zip" tail or an optional
        trailing period.

        Args:
            cardinal: cardinal tagger providing ``graph``, ``single_digits_graph``
                and ``graph_hundred_component_at_least_one_none_zero_digit``.

        Returns:
            The address graph (not optimized here).
        """
        # Ordinal street names: tag, then verbalize.
        ordinal_verbalizer = OrdinalVerbalizer().graph
        ordinal_tagger = OrdinalTagger(cardinal=cardinal).graph
        tagged_ordinal = (pynutil.insert("integer: \"") + ordinal_tagger +
                          pynutil.insert("\""))
        ordinal_num = pynini.compose(tagged_ordinal, ordinal_verbalizer)

        # 3-4 digit house numbers are split into <1-2 digits> <2 digits>, the
        # second part optionally starting with a spoken "zero".
        hundred_component = cardinal.graph_hundred_component_at_least_one_none_zero_digit
        leading_part = NEMO_DIGIT**(1, 2) @ hundred_component
        trailing_part = NEMO_DIGIT**2 @ (
            pynini.closure(pynini.cross("0", "zero "), 0, 1) +
            hundred_component)
        split_number = pynini.compose(
            NEMO_DIGIT**(3, 4), leading_part + insert_space + trailing_part)
        # to handle the rest of the numbers
        address_num = plurals._priority_union(split_number, cardinal.graph,
                                              NEMO_SIGMA)

        # Optional " E" / " S" / " W" / " N" with an optional trailing period.
        compass_letter = (pynini.cross("E", "East")
                          | pynini.cross("S", "South")
                          | pynini.cross("W", "West")
                          | pynini.cross("N", "North"))
        compass_word = compass_letter + pynini.closure(
            pynutil.delete("."), 0, 1)
        direction = pynini.closure(
            pynini.accep(NEMO_SPACE) + compass_word, 0, 1)

        # Street: first token (ordinal or capitalized word), further
        # capitalized tokens, then an entry from the address-word file.
        street_suffix = get_formats(
            get_abs_path("data/address/address_word.tsv"))
        capitalized_word = NEMO_UPPER + pynini.closure(NEMO_ALPHA, 1)
        first_token = pynini.closure(ordinal_num, 0, 1) | capitalized_word
        further_tokens = pynini.closure(NEMO_UPPER +
                                        pynini.closure(NEMO_ALPHA) +
                                        NEMO_SPACE)
        address_words = (pynini.accep(NEMO_SPACE) + first_token + NEMO_SPACE +
                         further_tokens + street_suffix)

        # Optional ", <city>".
        city_name = pynini.closure(NEMO_ALPHA | pynini.accep(NEMO_SPACE), 1)
        city = pynini.closure(
            pynini.accep(",") + pynini.accep(NEMO_SPACE) + city_name, 0, 1)

        # Optional ", <state>"; each entry's second column is also accepted
        # with a period after its first character.
        states = load_labels(get_abs_path("data/address/state.tsv"))
        dotted_variants = [(x, f"{y[0]}.{y[1:]}") for x, y in states]
        state_graph = pynini.string_map(states + dotted_variants)
        state_name = pynini.invert(state_graph)
        state = pynini.closure(
            pynini.accep(",") + pynini.accep(NEMO_SPACE) + state_name, 0, 1)

        # Optional zip code: exactly five digits read one by one, preceded by
        # an optional comma and a space.
        five_digits = pynini.compose(NEMO_DIGIT**5,
                                     cardinal.single_digits_graph)
        zip_code = pynini.closure(
            pynini.closure(pynini.accep(","), 0, 1) +
            pynini.accep(NEMO_SPACE) + five_digits,
            0,
            1,
        )

        street_part = address_num + direction + address_words
        with_locality = street_part + pynini.closure(
            city + state + zip_code, 0, 1)
        with_trailing_period = street_part + pynini.closure(
            pynini.cross(".", ""), 0, 1)

        return with_locality | with_trailing_period
Пример #14
0
    # Whitespace helpers: drop any run of whitespace, insert a single space,
    # and collapse one-or-more whitespace characters into a single space.
    delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE))
    insert_space = pynutil.insert(" ")
    delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")

    # French frequently compounds numbers with hyphen.
    delete_hyphen = pynutil.delete(pynini.closure("-", 0, 1))
    insert_hyphen = pynutil.insert("-")
    # Irregular singular/plural pairs are read from a data file.
    suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))

    # default rule: append "s"
    _s = NEMO_SIGMA + pynutil.insert("s")
    # endings eau / eu / ou take "x"
    _x = NEMO_SIGMA + pynini.string_map([("eau"), ("eu"),
                                         ("ou")]) + pynutil.insert("x")
    # endings al / ail become "aux"
    _aux = NEMO_SIGMA + pynini.string_map([("al", "aux"), ("ail", "aux")])

    # NOTE(review): `_s` is the higher-priority operand here and it accepts
    # every input, so the `_x` / `_aux` fallback looks unreachable -- confirm
    # whether the two operands of the inner union were meant to be swapped.
    graph_plural = plurals._priority_union(
        suppletive,
        plurals._priority_union(_s, pynini.union(_x, _aux), NEMO_SIGMA),
        NEMO_SIGMA).optimize()

    SINGULAR_TO_PLURAL = graph_plural
    PLURAL_TO_SINGULAR = pynini.invert(graph_plural)
    # ASCII case mapping, letter by letter (A->a ... Z->z) and its inverse.
    TO_LOWER = pynini.union(*[
        pynini.cross(x, y)
        for x, y in zip(string.ascii_uppercase, string.ascii_lowercase)
    ])
    TO_UPPER = pynini.invert(TO_LOWER)

    # Reached only if every statement above ran without raising.
    PYNINI_AVAILABLE = True
except (ModuleNotFoundError, ImportError):
    # Create placeholders
    NEMO_CHAR = None
Пример #15
0
    def __init__(self, deterministic: bool = True, lm: bool = False):
        """Assemble the cardinal classification grammar.

        The finished transducer is stored in ``self.fst``. Intermediate
        grammars are also exposed as attributes:
        ``graph_hundred_component_at_least_one_none_zero_digit``,
        ``single_digits_graph``, ``graph`` and ``graph_with_and``.

        Args:
            deterministic: if True, build a single reading per input; if
                False, union several alternative readings, with a small
                weight on the non-default ones.
            lm: only stored on the instance as ``self.lm`` in this method.
        """
        super().__init__(name="cardinal",
                         kind="classify",
                         deterministic=deterministic)

        self.lm = lm
        self.deterministic = deterministic

        # TODO replace to have "oh" as a default for "0"
        number_names = pynini.Far(
            get_abs_path("data/number/cardinal_number_name.far")).get_fst()

        # Two or three digits, or one digit other than "0".
        two_or_three_digits = pynini.closure(NEMO_DIGIT, 2, 3)
        one_non_zero_digit = pynini.difference(NEMO_DIGIT, pynini.accep("0"))
        self.graph_hundred_component_at_least_one_none_zero_digit = (
            two_or_three_digits | one_non_zero_digit) @ number_names

        digit_names = pynini.string_file(
            get_abs_path("data/number/digit.tsv"))
        zero_names = pynini.string_file(get_abs_path("data/number/zero.tsv"))

        # One digit at a time, with a space inserted between readings.
        single_digits_graph = pynini.invert(digit_names | zero_names)
        self.single_digits_graph = single_digits_graph + pynini.closure(
            insert_space + single_digits_graph)

        if not deterministic:
            # Within one token the reading of "0" must be consistent:
            # "007" -> {"oh oh seven", "zero zero seven"}, never "oh zero seven"
            digits_reading_zero = pynini.invert(digit_names | zero_names)
            digits_reading_oh = pynini.invert(digit_names) | pynini.cross(
                "0", "oh")

            self.single_digits_graph = digits_reading_zero + pynini.closure(
                insert_space + digits_reading_zero)
            self.single_digits_graph |= digits_reading_oh + pynini.closure(
                insert_space + digits_reading_oh)

            # Digit-by-digit reading of comma-grouped numbers: one to three
            # leading chunks, then at least one ",ddd" group (comma deleted).
            leading_chunks = pynini.closure(
                self.single_digits_graph + insert_space, 1, 3)
            comma_triplet = (pynutil.delete(",") + single_digits_graph +
                             insert_space + single_digits_graph +
                             insert_space + single_digits_graph)
            single_digits_graph_with_commas = leading_chunks + pynini.closure(
                comma_triplet, 1)

        # Optional leading "-" becomes the tag `negative: "true" `.
        minus_tag = pynutil.insert("negative: ") + pynini.cross(
            "-", "\"true\" ")
        optional_minus_graph = pynini.closure(minus_tag, 0, 1)

        # 1-3 leading digits followed by three-digit groups, either all
        # comma-separated (commas deleted) or all written without commas.
        leading_digits = pynini.closure(NEMO_DIGIT, 1, 3)
        groups_with_commas = pynini.closure(
            pynutil.delete(",") + NEMO_DIGIT**3)
        groups_without_commas = pynini.closure(NEMO_DIGIT**3)
        graph = (leading_digits +
                 (groups_with_commas | groups_without_commas)) @ number_names

        self.graph = graph
        self.graph_with_and = self.add_optional_and(graph)

        if deterministic:
            # Inputs of five or more digits are read digit by digit.
            five_or_more_digits = NEMO_DIGIT**(5, ...)
            long_numbers = pynini.compose(
                five_or_more_digits, self.single_digits_graph).optimize()
            final_graph = plurals._priority_union(
                long_numbers, self.graph_with_and, NEMO_SIGMA).optimize()

            # Anything starting with "0" is also read digit by digit.
            starts_with_zero = pynini.accep("0") + pynini.closure(NEMO_DIGIT)
            cardinal_with_leading_zeros = pynini.compose(
                starts_with_zero, self.single_digits_graph)
            final_graph |= cardinal_with_leading_zeros
        else:
            # Run of zeros read one by one, then the remainder as a number.
            one_or_more_zeros = pynini.closure(pynini.accep("0"), 1)
            leading_zeros = pynini.compose(one_or_more_zeros,
                                           self.single_digits_graph)
            remainder = pynini.compose(pynini.closure(NEMO_DIGIT),
                                       self.graph_with_and)
            cardinal_with_leading_zeros = (leading_zeros +
                                           pynutil.insert(" ") + remainder)

            # Non-default readings carry a small weight so that the
            # deterministic reading is listed first.
            weighted_single_digits = pynutil.add_weight(
                self.single_digits_graph, 0.0001)
            # allows e.g. 4567 to be pronounced as "forty five sixty seven"
            four_digit_year = get_four_digit_year_graph()
            weighted_with_commas = pynutil.add_weight(
                single_digits_graph_with_commas, 0.0001)
            final_graph = (self.graph_with_and
                           | weighted_single_digits
                           | four_digit_year
                           | weighted_with_commas
                           | cardinal_with_leading_zeros)

        # Wrap the verbalized number in the `integer: "..."` field.
        final_graph = (optional_minus_graph + pynutil.insert("integer: \"") +
                       final_graph + pynutil.insert("\""))
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Пример #16
0
    def __init__(self, deterministic: bool = True, lm: bool = False):
        """Assemble the fraction verbalization grammar.

        Consumes the tagged fields ``integer_part``, ``numerator`` and
        ``denominator`` and emits the spoken form. The grammar is stored in
        ``self.graph``; ``self.fst`` is that grammar passed through
        ``self.delete_tokens`` and optimized.

        Args:
            deterministic: if False, the denominator "four" additionally
                accepts the ``suffix`` reading alongside "quarter".
            lm: if both ``deterministic`` and ``lm`` are falsy, the "and "
                inserted after the integer part becomes optional.
        """
        super().__init__(name="fraction",
                         kind="verbalize",
                         deterministic=deterministic)
        suffix = OrdinalFst().suffix

        # Strip the integer_part field wrapper, keep its content.
        integer = (pynutil.delete("integer_part: \"") +
                   pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\" "))

        # Denominators with a dedicated wording.
        denominator_one = pynini.cross("denominator: \"one\"", "over one")
        denominator_half = pynini.cross("denominator: \"two\"", "half")
        denominator_quarter = pynini.cross("denominator: \"four\"", "quarter")

        # Any other denominator goes through the ordinal suffix grammar.
        via_suffix = pynini.closure(NEMO_NOT_QUOTE) @ suffix
        denominator_rest = (pynutil.delete("denominator: \"") + via_suffix +
                            pynutil.delete("\""))

        # Chain of priority unions, built from the innermost outwards:
        # one, then half, then quarter, then the generic rule.
        # NOTE(review): presumably the first argument wins on overlap;
        # confirm against plurals._priority_union.
        quarter_or_rest = plurals._priority_union(denominator_quarter,
                                                  denominator_rest, NEMO_SIGMA)
        half_or_later = plurals._priority_union(denominator_half,
                                                quarter_or_rest, NEMO_SIGMA)
        denominators = plurals._priority_union(denominator_one, half_or_later,
                                               NEMO_SIGMA).optimize()
        if not deterministic:
            four_via_suffix = pynini.accep("four") @ suffix
            denominators |= (pynutil.delete("denominator: \"") +
                             four_via_suffix + pynutil.delete("\""))

        # Numerator "one": the denominator wording is left unchanged.
        numerator_one = (pynutil.delete("numerator: \"") +
                         pynini.accep("one") + pynutil.delete("\" ") +
                         insert_space + denominators)

        # Any other numerator: the end of the string is rewritten,
        # "half" -> "halves", otherwise an "s" is inserted.
        not_one = pynini.closure(NEMO_NOT_QUOTE) - pynini.accep("one")
        numerator_rest = (pynutil.delete("numerator: \"") + not_one +
                          pynutil.delete("\" ") + insert_space + denominators)
        pluralize = plurals._priority_union(pynini.cross("half", "halves"),
                                            pynutil.insert("s"), NEMO_SIGMA)
        numerator_rest @= pynini.cdrewrite(pluralize, "", "[EOS]", NEMO_SIGMA)

        fraction = numerator_one | numerator_rest

        # "and " is inserted after the integer part; see `lm` above for when
        # it becomes optional.
        conjunction = pynutil.insert("and ")
        if not (deterministic or lm):
            conjunction = pynini.closure(conjunction, 0, 1)

        # The whole integer part (with its conjunction) is optional.
        integer = pynini.closure(integer + insert_space + conjunction, 0, 1)

        graph = integer + fraction
        # End-of-string clean-ups: "and one half" -> "and a half", and undo
        # the plural "s" on "over one".
        final_fixups = (pynini.cross("and one half", "and a half")
                        | pynini.cross("over ones", "over one"))
        graph @= pynini.cdrewrite(final_fixups, "", "[EOS]", NEMO_SIGMA)

        self.graph = graph
        delete_tokens = self.delete_tokens(self.graph)
        self.fst = delete_tokens.optimize()