Exemplo n.º 1
0
    def __init__(self, deterministic: bool = True):
        super().__init__(name="ordinal",
                         kind="verbalize",
                         deterministic=deterministic)

        graph_digit = pynini.string_file(
            get_abs_path("data/ordinals/digit.tsv")).invert()
        graph_teens = pynini.string_file(
            get_abs_path("data/ordinals/teen.tsv")).invert()

        graph = (pynutil.delete("integer:") + delete_space +
                 pynutil.delete("\"") + pynini.closure(NEMO_NOT_QUOTE, 1) +
                 pynutil.delete("\""))
        convert_rest = pynutil.insert("th", weight=0.01)

        suffix = pynini.cdrewrite(
            graph_digit | graph_teens
            | pynutil.add_weight(pynini.cross("ty", "tieth"), weight=0.001)
            | convert_rest,
            "",
            "[EOS]",
            NEMO_SIGMA,
        ).optimize()
        self.graph = pynini.compose(graph, suffix)
        self.suffix = suffix
        delete_tokens = self.delete_tokens(self.graph)
        self.fst = delete_tokens.optimize()
Exemplo n.º 2
0
    def get_address_graph(self, cardinal):
        """
        Finite state transducer for classifying serial.
            The serial is a combination of digits, letters and dashes, e.g.:
            2788 San Tomas Expy, Santa Clara, CA 95051 ->
                units: "address" cardinal
                { integer: "two seven eight eight San Tomas Expressway Santa Clara California nine five zero five one" }
                 preserve_order: true
        """
        ordinal_verbalizer = OrdinalVerbalizer().graph
        ordinal_tagger = OrdinalTagger(cardinal=cardinal).graph
        ordinal_num = pynini.compose(
            pynutil.insert("integer: \"") + ordinal_tagger +
            pynutil.insert("\""), ordinal_verbalizer)

        address_num = pynini.closure(NEMO_DIGIT,
                                     1) @ cardinal.single_digits_graph

        direction = (pynini.cross("E", "East")
                     | pynini.cross("S", "South")
                     | pynini.cross("W", "West")
                     | pynini.cross("N", "North"))
        direction = pynini.closure(
            pynutil.add_weight(pynini.accep(NEMO_SPACE) + direction, -1), 0, 1)

        address_words = pynini.string_file(
            get_abs_path("data/address/address_words.tsv"))
        address_words = (pynini.accep(NEMO_SPACE) +
                         pynini.closure(ordinal_num, 0, 1) +
                         pynini.closure(NEMO_ALPHA | NEMO_SPACE, 1) +
                         address_words)

        city = pynini.closure(NEMO_ALPHA | pynini.accep(NEMO_SPACE), 1)
        city = pynini.closure(
            pynini.cross(",", "") + pynini.accep(NEMO_SPACE) + city, 0, 1)

        state = pynini.invert(
            pynini.string_file(get_abs_path("data/address/states.tsv")))
        state = pynini.closure(
            pynini.cross(",", "") + pynini.accep(NEMO_SPACE) + state, 0, 1)

        zip_code = pynini.compose(NEMO_DIGIT**5, cardinal.single_digits_graph)
        zip_code = pynini.closure(
            pynutil.add_weight(
                pynini.closure(pynini.cross(",", ""), 0, 1) +
                pynini.accep(NEMO_SPACE) + zip_code, -100),
            0,
            1,
        )

        address = (address_num + direction + address_words +
                   pynini.closure(pynini.cross(".", ""), 0, 1) + city + state +
                   zip_code)
        return address
Exemplo n.º 3
0
    def __init__(self, deterministic: bool = True):
        super().__init__(name="electronic",
                         kind="verbalize",
                         deterministic=deterministic)
        graph_digit_no_zero = pynini.invert(
            pynini.string_file(
                get_abs_path("data/numbers/digit.tsv"))).optimize()
        graph_zero = pynini.cross("0", "zero")

        if not deterministic:
            graph_zero |= pynini.cross("0", "o") | pynini.cross("0", "oh")

        graph_digit = graph_digit_no_zero | graph_zero
        graph_symbols = pynini.string_file(
            get_abs_path("data/electronic/symbols.tsv")).optimize()
        user_name = (
            pynutil.delete("username:"******"\"") +
            (pynini.closure(
                pynutil.add_weight(graph_digit + insert_space, 1.09)
                | pynutil.add_weight(
                    pynini.closure(graph_symbols + pynutil.insert(" ")), 1.09)
                | pynutil.add_weight(NEMO_NOT_QUOTE + insert_space, 1.1))) +
            pynutil.delete("\""))

        server_common = pynini.string_file(
            get_abs_path("data/electronic/server_name.tsv"))
        domain_common = pynini.string_file(
            get_abs_path("data/electronic/domain.tsv"))

        convert_defaults = (NEMO_NOT_QUOTE
                            | pynutil.add_weight(domain_common, -0.1)
                            | pynutil.add_weight(server_common, -0.1))
        domain = convert_defaults + pynini.closure(
            pynutil.insert(" ") + convert_defaults)
        domain = pynini.compose(
            domain,
            pynini.closure(
                pynutil.add_weight(graph_symbols, -0.1)
                | pynutil.add_weight(graph_digit, -0.1) | NEMO_NOT_QUOTE),
        )

        domain = (pynutil.delete("domain:") + delete_space +
                  pynutil.delete("\"") + domain + delete_space +
                  pynutil.delete("\""))

        protocol = pynutil.delete("protocol: \"") + pynini.closure(
            NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
        graph = (pynini.closure(protocol + delete_space, 0, 1) +
                 pynini.closure(
                     user_name + delete_space + pynutil.insert("at ") +
                     delete_space, 0, 1) + domain + delete_space)

        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
Exemplo n.º 4
0
 def _get_whitelist_non_deterministic_graph(
         file="data/whitelist_alternatives.tsv"):
     whitelist = load_labels(get_abs_path(file))
     whitelist_lower = [(x.lower(), y.lower()) for x, y in whitelist]
     whitelist_cased = [(x, y) for x, y in whitelist]
     graph = pynini.string_map(whitelist_lower + whitelist_cased)
     return graph
Exemplo n.º 5
0
    def __init__(self, deterministic: bool = True):
        super().__init__(name="telephone",
                         kind="classify",
                         deterministic=deterministic)

        add_separator = pynutil.insert(", ")  # between components
        digit = pynini.invert(
            pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        ).optimize() | pynini.cross("0", "o")

        country_code = (pynutil.insert("country_code: \"") +
                        pynini.closure(pynutil.delete("+"), 0, 1) +
                        pynini.closure(digit + insert_space, 0, 2) + digit +
                        pynutil.insert("\""))
        optional_country_code = pynini.closure(
            country_code + pynini.closure(pynutil.delete("-"), 0, 1) +
            delete_space + insert_space, 0, 1)

        area_part_common = pynutil.add_weight(
            pynini.cross("800", "eight hundred"), -1.1)
        area_part_default = pynini.closure(digit + insert_space, 2, 2) + digit
        area_part = area_part_default | area_part_common

        area_part = (
            (area_part + pynutil.delete("-"))
            | (pynutil.delete("(") + area_part +
               (pynutil.delete(") ") | pynutil.delete(")-")))) + add_separator

        del_separator = pynini.closure(pynini.union("-", " "), 0, 1)
        number_length = ((NEMO_DIGIT + del_separator) |
                         (NEMO_ALPHA + del_separator))**7
        number_words = pynini.closure((NEMO_DIGIT @ digit) +
                                      (insert_space | pynini.cross("-", ', '))
                                      | NEMO_ALPHA
                                      | (NEMO_ALPHA + pynini.cross("-", ' ')))
        number_words = pynini.compose(number_length, number_words)
        number_part = area_part + number_words
        number_part = pynutil.insert(
            "number_part: \"") + number_part + pynutil.insert("\"")
        extension = (pynutil.insert("extension: \"") +
                     pynini.closure(digit + insert_space, 0, 3) + digit +
                     pynutil.insert("\""))
        optional_extension = pynini.closure(insert_space + extension, 0, 1)

        graph = optional_country_code + number_part + optional_extension
        final_graph = self.add_tokens(graph)
        self.fst = final_graph.optimize()
Exemplo n.º 6
0
    def __init__(self,
                 input_case: str,
                 deterministic: bool = True,
                 input_file: str = None):
        super().__init__(name="whitelist",
                         kind="classify",
                         deterministic=deterministic)

        def _get_whitelist_graph(input_case, file):
            whitelist = load_labels(file)
            if input_case == "lower_cased":
                whitelist = [(x.lower(), y) for x, y in whitelist]
            else:
                whitelist = [(x, y) for x, y in whitelist]
            graph = pynini.string_map(whitelist)
            return graph

        def _get_whitelist_non_deterministic_graph(
                file="data/whitelist_alternatives.tsv"):
            whitelist = load_labels(get_abs_path(file))
            whitelist_lower = [(x.lower(), y.lower()) for x, y in whitelist]
            whitelist_cased = [(x, y) for x, y in whitelist]
            graph = pynini.string_map(whitelist_lower + whitelist_cased)
            return graph

        graph = _get_whitelist_graph(input_case,
                                     get_abs_path("data/whitelist.tsv"))
        if not deterministic:
            graph |= _get_whitelist_non_deterministic_graph()

        if input_file:
            whitelist_provided = _get_whitelist_graph(input_case, input_file)
            if not deterministic:
                graph |= whitelist_provided
            else:
                graph = whitelist_provided

        self.graph = (convert_space(graph)).optimize()
        self.fst = (pynutil.insert("name: \"") + self.graph +
                    pynutil.insert("\"")).optimize()
Exemplo n.º 7
0
 def _load_roman(file: str):
     roman = load_labels(get_abs_path(file))
     roman_numerals = [(x, y) for x, y in roman] + [(x.upper(), y)
                                                    for x, y in roman]
     return pynini.string_map(roman_numerals)
Exemplo n.º 8
0
    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        super().__init__(name="time",
                         kind="classify",
                         deterministic=deterministic)
        suffix_graph = pynini.string_file(get_abs_path("data/time_suffix.tsv"))
        time_zone_graph = pynini.string_file(
            get_abs_path("data/time_zone.tsv"))

        # only used for < 1000 thousand -> 0 weight
        cardinal = cardinal.graph

        labels_hour = [str(x) for x in range(0, 24)]
        labels_minute_single = [str(x) for x in range(1, 10)]
        labels_minute_double = [str(x) for x in range(10, 60)]

        delete_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (
            pynini.closure(pynutil.delete("0"), 0, 1) + NEMO_DIGIT)

        graph_hour = delete_leading_zero_to_double_digit @ pynini.union(
            *labels_hour) @ cardinal

        graph_minute_single = pynini.union(*labels_minute_single) @ cardinal
        graph_minute_double = pynini.union(*labels_minute_double) @ cardinal

        final_graph_hour = pynutil.insert(
            "hours: \"") + graph_hour + pynutil.insert("\"")
        final_graph_minute = (
            pynutil.insert("minutes: \"") +
            (pynini.cross("0", "o") + insert_space + graph_minute_single
             | graph_minute_double) + pynutil.insert("\""))
        final_graph_second = (
            pynutil.insert("seconds: \"") +
            (pynini.cross("0", "o") + insert_space + graph_minute_single
             | graph_minute_double) + pynutil.insert("\""))
        final_suffix = pynutil.insert("suffix: \"") + convert_space(
            suffix_graph) + pynutil.insert("\"")
        final_suffix_optional = pynini.closure(
            delete_space + insert_space + final_suffix, 0, 1)
        final_time_zone_optional = pynini.closure(
            delete_space + insert_space + pynutil.insert("zone: \"") +
            convert_space(time_zone_graph) + pynutil.insert("\""),
            0,
            1,
        )

        # 2:30 pm, 02:30, 2:00
        graph_hm = (
            final_graph_hour + pynutil.delete(":") +
            (pynutil.delete("00") | insert_space + final_graph_minute) +
            final_suffix_optional + final_time_zone_optional)

        # 10:30:05 pm,
        graph_hms = (final_graph_hour + pynutil.delete(":") +
                     (pynini.cross("00", " minutes: \"zero\"")
                      | insert_space + final_graph_minute) +
                     pynutil.delete(":") +
                     (pynini.cross("00", " seconds: \"zero\"")
                      | insert_space + final_graph_second) +
                     final_suffix_optional + final_time_zone_optional)

        # 2.xx pm/am
        graph_hm2 = (
            final_graph_hour + pynutil.delete(".") +
            (pynutil.delete("00") | insert_space + final_graph_minute) +
            delete_space + insert_space + final_suffix +
            final_time_zone_optional)
        # 2 pm est
        graph_h = final_graph_hour + delete_space + insert_space + final_suffix + final_time_zone_optional
        final_graph = (graph_hm | graph_h | graph_hm2 | graph_hms).optimize()

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Exemplo n.º 9
0
    def __init__(self, cardinal: GraphFst, deterministic: bool):
        super().__init__(name="date", kind="classify", deterministic=deterministic)

        month_graph = pynini.string_file(get_abs_path("data/months/names.tsv")).optimize()
        month_graph |= pynini.compose(TO_LOWER + pynini.closure(NEMO_CHAR), month_graph) | pynini.compose(
            TO_LOWER ** (2, ...), month_graph
        )

        month_abbr_graph = pynini.string_file(get_abs_path("data/months/abbr.tsv")).optimize()
        month_abbr_graph = (
            month_abbr_graph
            | pynini.compose(TO_LOWER + pynini.closure(NEMO_LOWER, 1), month_abbr_graph).optimize()
            | pynini.compose(TO_LOWER ** (2, ...), month_abbr_graph).optimize()
        ) + pynini.closure(pynutil.delete("."), 0, 1)
        month_graph |= month_abbr_graph.optimize()

        month_numbers_graph = pynini.string_file(get_abs_path("data/months/numbers.tsv")).optimize()
        cardinal_graph = cardinal.graph_hundred_component_at_least_one_none_zero_digit

        year_graph = _get_year_graph(deterministic)

        YEAR_WEIGHT = 0.001
        year_graph_standalone = (
            pynutil.insert("year: \"") + pynutil.add_weight(year_graph, YEAR_WEIGHT) + pynutil.insert("\"")
        )

        month_graph = pynutil.insert("month: \"") + month_graph + pynutil.insert("\"")
        month_numbers_graph = pynutil.insert("month: \"") + month_numbers_graph + pynutil.insert("\"")

        endings = ["rd", "th", "st", "nd"]
        endings += [x.upper() for x in endings]
        endings = pynini.union(*endings)

        day_graph = (
            pynutil.insert("day: \"")
            + pynini.closure(pynutil.delete("the "), 0, 1)
            + (
                ((pynini.union("1", "2") + NEMO_DIGIT) | NEMO_DIGIT | (pynini.accep("3") + pynini.union("0", "1")))
                + pynini.closure(pynutil.delete(endings), 0, 1)
            )
            @ cardinal_graph
            + pynutil.insert("\"")
        )

        two_digit_year = NEMO_DIGIT ** (2) @ (cardinal.single_digits_graph | cardinal_graph)
        two_digit_year = pynutil.insert("year: \"") + two_digit_year + pynutil.insert("\"")
        graph_year = pynutil.insert(" year: \"") + pynutil.delete(" ") + year_graph + pynutil.insert("\"")
        optional_graph_year = pynini.closure(graph_year, 0, 1)
        year_graph = pynutil.insert("year: \"") + year_graph + pynutil.insert("\"")

        graph_mdy = month_graph + (
            (delete_extra_space + day_graph)
            | (pynini.accep(" ") + day_graph)
            | graph_year
            | (delete_extra_space + day_graph + graph_year)
        )

        delete_sep = pynutil.delete(pynini.union("-", "/", "."))
        graph_mdy |= (
            month_numbers_graph
            + delete_sep
            + insert_space
            + pynini.closure(pynutil.delete("0"), 0, 1)
            + day_graph
            + delete_sep
            + insert_space
            + (year_graph | two_digit_year)
        )

        graph_dmy = day_graph + delete_extra_space + month_graph + optional_graph_year
        graph_ymd = (
            (year_graph | two_digit_year)
            + delete_sep
            + insert_space
            + month_numbers_graph
            + delete_sep
            + insert_space
            + pynini.closure(pynutil.delete("0"), 0, 1)
            + day_graph
        )

        final_graph = graph_mdy | graph_dmy

        if deterministic:
            final_graph += pynutil.insert(" preserve_order: true")
        else:
            final_graph += pynini.closure(pynutil.insert(" preserve_order: true"), 0, 1)
            m_sep_d = (
                month_numbers_graph + delete_sep + insert_space + pynini.closure(pynutil.delete("0"), 0, 1) + day_graph
            )
            final_graph |= m_sep_d

        final_graph |= graph_ymd | year_graph_standalone

        if not deterministic:
            ymd_to_mdy_graph = None
            ymd_to_dmy_graph = None
            mdy_to_dmy_graph = None
            md_to_dm_graph = None

            for month in [x[0] for x in load_labels(get_abs_path("data/months/names.tsv"))]:
                for day in [x[0] for x in load_labels(get_abs_path("data/months/days.tsv"))]:
                    ymd_to_mdy_curr = (
                        pynutil.insert("month: \"" + month + "\" day: \"" + day + "\" ")
                        + pynini.accep('year:')
                        + NEMO_SIGMA
                        + pynutil.delete(" month: \"" + month + "\" day: \"" + day + "\"")
                    )

                    # YY-MM-DD -> MM-DD-YY
                    ymd_to_mdy_curr = pynini.compose(graph_ymd, ymd_to_mdy_curr)
                    ymd_to_mdy_graph = (
                        ymd_to_mdy_curr
                        if ymd_to_mdy_graph is None
                        else pynini.union(ymd_to_mdy_curr, ymd_to_mdy_graph)
                    )

                    ymd_to_dmy_curr = (
                        pynutil.insert("day: \"" + day + "\" month: \"" + month + "\" ")
                        + pynini.accep('year:')
                        + NEMO_SIGMA
                        + pynutil.delete(" month: \"" + month + "\" day: \"" + day + "\"")
                    )

                    # YY-MM-DD -> MM-DD-YY
                    ymd_to_dmy_curr = pynini.compose(graph_ymd, ymd_to_dmy_curr).optimize()
                    ymd_to_dmy_graph = (
                        ymd_to_dmy_curr
                        if ymd_to_dmy_graph is None
                        else pynini.union(ymd_to_dmy_curr, ymd_to_dmy_graph)
                    )

                    mdy_to_dmy_curr = (
                        pynutil.insert("day: \"" + day + "\" month: \"" + month + "\" ")
                        + pynutil.delete("month: \"" + month + "\" day: \"" + day + "\" ")
                        + pynini.accep('year:')
                        + NEMO_SIGMA
                    ).optimize()
                    # MM-DD-YY -> verbalize as MM-DD-YY (February fourth 1991) or DD-MM-YY (the fourth of February 1991)
                    mdy_to_dmy_curr = pynini.compose(graph_mdy, mdy_to_dmy_curr).optimize()
                    mdy_to_dmy_graph = (
                        mdy_to_dmy_curr
                        if mdy_to_dmy_graph is None
                        else pynini.union(mdy_to_dmy_curr, mdy_to_dmy_graph).optimize()
                    ).optimize()

                    md_to_dm_curr = pynutil.insert("day: \"" + day + "\" month: \"" + month + "\"") + pynutil.delete(
                        "month: \"" + month + "\" day: \"" + day + "\""
                    )
                    md_to_dm_curr = pynini.compose(m_sep_d, md_to_dm_curr).optimize()

                    md_to_dm_graph = (
                        md_to_dm_curr
                        if md_to_dm_graph is None
                        else pynini.union(md_to_dm_curr, md_to_dm_graph).optimize()
                    ).optimize()

            final_graph |= mdy_to_dmy_graph | md_to_dm_graph | ymd_to_mdy_graph | ymd_to_dmy_graph

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Exemplo n.º 10
0
    NEMO_CHAR,
    NEMO_DIGIT,
    NEMO_LOWER,
    NEMO_SIGMA,
    TO_LOWER,
    GraphFst,
    delete_extra_space,
    insert_space,
)
from nemo_text_processing.text_normalization.ar.utils import get_abs_path, load_labels

try:
    import pynini
    from pynini.lib import pynutil

    graph_teen = pynini.invert(pynini.string_file(get_abs_path("data/numbers/teen.tsv"))).optimize()
    graph_digit = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv"))).optimize()
    ties_graph = pynini.invert(pynini.string_file(get_abs_path("data/numbers/ties.tsv"))).optimize()

    PYNINI_AVAILABLE = True
except (ModuleNotFoundError, ImportError):
    # Add placeholders for global variables
    graph_teen = None
    graph_digit = None
    ties_graph = None

    PYNINI_AVAILABLE = True


def get_ties_graph(deterministic: bool = True):
    """
Exemplo n.º 11
0
    def __init__(self,
                 cardinal: GraphFst,
                 decimal: GraphFst,
                 fraction: GraphFst,
                 deterministic: bool = True):
        super().__init__(name="measure",
                         kind="classify",
                         deterministic=deterministic)
        cardinal_graph = cardinal.graph

        if not deterministic:
            cardinal_graph |= cardinal.range_graph

        graph_unit = pynini.string_file(get_abs_path("data/measurements.tsv"))
        graph_unit |= pynini.compose(
            pynini.closure(TO_LOWER, 1) + pynini.closure(NEMO_ALPHA),
            graph_unit)

        graph_unit_plural = convert_space(graph_unit @ SINGULAR_TO_PLURAL)
        graph_unit = convert_space(graph_unit)
        optional_graph_negative = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0,
            1)

        graph_unit2 = pynini.cross("/", "per") + delete_space + pynutil.insert(
            NEMO_NON_BREAKING_SPACE) + graph_unit

        optional_graph_unit2 = pynini.closure(
            delete_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) +
            graph_unit2,
            0,
            1,
        )

        unit_plural = (
            pynutil.insert("units: \"") +
            (graph_unit_plural + optional_graph_unit2 | graph_unit2) +
            pynutil.insert("\""))

        unit_singular = (pynutil.insert("units: \"") +
                         (graph_unit + optional_graph_unit2 | graph_unit2) +
                         pynutil.insert("\""))

        subgraph_decimal = (pynutil.insert("decimal { ") +
                            optional_graph_negative +
                            decimal.final_graph_wo_negative + delete_space +
                            pynutil.insert(" } ") + unit_plural)

        subgraph_cardinal = (pynutil.insert("cardinal { ") +
                             optional_graph_negative +
                             pynutil.insert("integer: \"") +
                             ((NEMO_SIGMA - "1") @ cardinal_graph) +
                             delete_space + pynutil.insert("\"") +
                             pynutil.insert(" } ") + unit_plural)

        subgraph_cardinal |= (pynutil.insert("cardinal { ") +
                              optional_graph_negative +
                              pynutil.insert("integer: \"") +
                              pynini.cross("1", "one") + delete_space +
                              pynutil.insert("\"") + pynutil.insert(" } ") +
                              unit_singular)

        cardinal_dash_alpha = (pynutil.insert("cardinal { integer: \"") +
                               cardinal_graph + pynini.accep('-') +
                               pynutil.insert("\" } units: \"") +
                               pynini.closure(NEMO_ALPHA, 1) +
                               pynutil.insert("\""))

        alpha_dash_cardinal = (pynutil.insert("units: \"") +
                               pynini.closure(NEMO_ALPHA, 1) +
                               pynini.accep('-') + pynutil.insert("\"") +
                               pynutil.insert(" cardinal { integer: \"") +
                               cardinal_graph +
                               pynutil.insert("\" } preserve_order: true"))

        decimal_dash_alpha = (pynutil.insert("decimal { ") +
                              decimal.final_graph_wo_negative +
                              pynini.cross('-', '') +
                              pynutil.insert(" } units: \"") +
                              pynini.closure(NEMO_ALPHA, 1) +
                              pynutil.insert("\""))

        decimal_times = (pynutil.insert("decimal { ") +
                         decimal.final_graph_wo_negative +
                         pynutil.insert(" } units: \"") +
                         pynini.cross(pynini.union('x', "X"), 'x') +
                         pynutil.insert("\""))

        alpha_dash_decimal = (pynutil.insert("units: \"") +
                              pynini.closure(NEMO_ALPHA, 1) +
                              pynini.accep('-') + pynutil.insert("\"") +
                              pynutil.insert(" decimal { ") +
                              decimal.final_graph_wo_negative +
                              pynutil.insert(" } preserve_order: true"))

        subgraph_fraction = (pynutil.insert("fraction { ") + fraction.graph +
                             delete_space + pynutil.insert(" } ") +
                             unit_plural)

        address = self.get_address_graph(cardinal)
        address = (
            pynutil.insert("units: \"address\" cardinal { integer: \"") +
            address + pynutil.insert("\" } preserve_order: true"))

        math_operations = pynini.string_file(
            get_abs_path("data/math_operations.tsv"))
        delimiter = pynini.accep(" ") | pynutil.insert(" ")

        math = (cardinal_graph + delimiter + math_operations + delimiter +
                cardinal_graph + delimiter + pynini.cross("=", "equals") +
                delimiter + cardinal_graph)
        math = (pynutil.insert("units: \"math\" cardinal { integer: \"") +
                math + pynutil.insert("\" } preserve_order: true"))
        final_graph = (subgraph_decimal
                       | subgraph_cardinal
                       | cardinal_dash_alpha
                       | alpha_dash_cardinal
                       | decimal_dash_alpha
                       | decimal_times
                       | alpha_dash_decimal
                       | subgraph_fraction
                       | address
                       | math)
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Exemplo n.º 12
0
    def __init__(self, deterministic: bool = True):
        super().__init__(name="cardinal",
                         kind="classify",
                         deterministic=deterministic)
        # TODO repalce to have "oh" as a default for "0"
        graph = pynini.Far(
            get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()
        self.graph_hundred_component_at_least_one_none_zero_digit = (
            pynini.closure(NEMO_DIGIT, 2, 3)
            | pynini.difference(NEMO_DIGIT, pynini.accep("0"))) @ graph
        self.graph = (pynini.closure(NEMO_DIGIT, 1, 3) + pynini.closure(
            pynini.closure(pynutil.delete(","), 0, 1) + NEMO_DIGIT +
            NEMO_DIGIT + NEMO_DIGIT)) @ graph

        graph_digit = pynini.string_file(
            get_abs_path("data/numbers/digit.tsv"))
        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))

        single_digits_graph = pynini.invert(graph_digit | graph_zero)
        self.single_digits_graph = single_digits_graph + pynini.closure(
            insert_space + single_digits_graph)

        if not deterministic:
            # for a single token allow only the same normalization
            # "007" -> {"oh oh seven", "zero zero seven"} not {"oh zero seven"}
            single_digits_graph_zero = pynini.invert(graph_digit | graph_zero)
            single_digits_graph_oh = pynini.invert(graph_digit) | pynini.cross(
                "0", "oh")

            self.single_digits_graph = single_digits_graph_zero + pynini.closure(
                insert_space + single_digits_graph_zero)
            self.single_digits_graph |= single_digits_graph_oh + pynini.closure(
                insert_space + single_digits_graph_oh)

            single_digits_graph_with_commas = pynini.closure(
                self.single_digits_graph + insert_space, 1,
                3) + pynini.closure(
                    pynutil.delete(",") + single_digits_graph + insert_space +
                    single_digits_graph + insert_space + single_digits_graph,
                    1,
                )

            self.range_graph = pynutil.insert(
                "from ") + self.graph + pynini.cross("-", " to ") + self.graph
            self.range_graph |= self.graph + (pynini.cross(
                "x", " by ") | pynini.cross(" x ", " by ")) + self.graph
            self.range_graph |= (pynutil.insert("from ") +
                                 get_hundreds_graph() +
                                 pynini.cross("-", " to ") +
                                 get_hundreds_graph())
            self.range_graph = self.range_graph.optimize()

        serial_graph = self.get_serial_graph()
        optional_minus_graph = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0,
            1)

        if deterministic:
            long_numbers = pynini.compose(NEMO_DIGIT**(5, ...),
                                          self.single_digits_graph).optimize()
            final_graph = self.graph | serial_graph | pynutil.add_weight(
                long_numbers, -0.001)
            cardinal_with_leading_zeros = pynini.compose(
                pynini.accep("0") + pynini.closure(NEMO_DIGIT),
                self.single_digits_graph)
            final_graph |= cardinal_with_leading_zeros
        else:

            leading_zeros = pynini.compose(
                pynini.closure(pynini.accep("0"), 1), self.single_digits_graph)
            cardinal_with_leading_zeros = (
                leading_zeros + pynutil.insert(" ") +
                pynini.compose(pynini.closure(NEMO_DIGIT), self.graph))

            final_graph = (self.graph
                           | serial_graph
                           | self.range_graph
                           | self.single_digits_graph
                           | get_hundreds_graph()
                           | pynutil.add_weight(
                               single_digits_graph_with_commas, 0.001)
                           | cardinal_with_leading_zeros)

        final_graph = optional_minus_graph + pynutil.insert(
            "integer: \"") + final_graph + pynutil.insert("\"")
        final_graph = self.add_tokens(final_graph)

        self.fst = final_graph.optimize()
Exemplo n.º 13
0
    def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = True):
        super().__init__(name="money", kind="classify", deterministic=deterministic)
        cardinal_graph = cardinal.graph
        graph_decimal_final = decimal.final_graph_wo_negative

        maj_singular_labels = load_labels(get_abs_path("data/currency/currency.tsv"))
        maj_unit_plural = convert_space(maj_singular @ SINGULAR_TO_PLURAL)
        maj_unit_singular = convert_space(maj_singular)

        graph_maj_singular = pynutil.insert("currency_maj: \"") + maj_unit_singular + pynutil.insert("\"")
        graph_maj_plural = pynutil.insert("currency_maj: \"") + maj_unit_plural + pynutil.insert("\"")

        optional_delete_fractional_zeros = pynini.closure(
            pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), 0, 1
        )

        graph_integer_one = pynutil.insert("integer_part: \"") + pynini.cross("1", "one") + pynutil.insert("\"")
        # only for decimals where third decimal after comma is non-zero or with quantity
        decimal_delete_last_zeros = (
            pynini.closure(NEMO_DIGIT | pynutil.delete(","))
            + pynini.accep(".")
            + pynini.closure(NEMO_DIGIT, 2)
            + (NEMO_DIGIT - "0")
            + pynini.closure(pynutil.delete("0"))
        )
        decimal_with_quantity = NEMO_SIGMA + NEMO_ALPHA

        graph_decimal = (
            graph_maj_plural + insert_space + (decimal_delete_last_zeros | decimal_with_quantity) @ graph_decimal_final
        )

        graph_integer = (
            pynutil.insert("integer_part: \"") + ((NEMO_SIGMA - "1") @ cardinal_graph) + pynutil.insert("\"")
        )

        graph_integer_only = graph_maj_singular + insert_space + graph_integer_one
        graph_integer_only |= graph_maj_plural + insert_space + graph_integer

        final_graph = (graph_integer_only + optional_delete_fractional_zeros) | graph_decimal

        # remove trailing zeros of non zero number in the first 2 digits and fill up to 2 digits
        # e.g. 2000 -> 20, 0200->02, 01 -> 01, 10 -> 10
        # not accepted: 002, 00, 0,
        two_digits_fractional_part = (
            pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(pynutil.delete("0"))
        ) @ (
            (pynutil.delete("0") + (NEMO_DIGIT - "0"))
            | ((NEMO_DIGIT - "0") + pynutil.insert("0"))
            | ((NEMO_DIGIT - "0") + NEMO_DIGIT)
        )

        graph_min_singular = pynutil.insert(" currency_min: \"") + min_singular + pynutil.insert("\"")
        graph_min_plural = pynutil.insert(" currency_min: \"") + min_plural + pynutil.insert("\"")
        # format ** dollars ** cent
        decimal_graph_with_minor = None
        integer_graph_reordered = None
        decimal_default_reordered = None
        for curr_symbol, _ in maj_singular_labels:
            preserve_order = pynutil.insert(" preserve_order: true")
            integer_plus_maj = graph_integer + insert_space + pynutil.insert(curr_symbol) @ graph_maj_plural
            integer_plus_maj |= graph_integer_one + insert_space + pynutil.insert(curr_symbol) @ graph_maj_singular

            integer_plus_maj_with_comma = pynini.compose(
                NEMO_DIGIT - "0" + pynini.closure(NEMO_DIGIT | pynutil.delete(",")), integer_plus_maj
            )
            integer_plus_maj = pynini.compose(pynini.closure(NEMO_DIGIT) - "0", integer_plus_maj)
            integer_plus_maj |= integer_plus_maj_with_comma

            graph_fractional_one = two_digits_fractional_part @ pynini.cross("1", "one")
            graph_fractional_one = pynutil.insert("fractional_part: \"") + graph_fractional_one + pynutil.insert("\"")
            graph_fractional = (
                two_digits_fractional_part
                @ (pynini.closure(NEMO_DIGIT, 1, 2) - "1")
                @ cardinal.graph_hundred_component_at_least_one_none_zero_digit
            )
            graph_fractional = pynutil.insert("fractional_part: \"") + graph_fractional + pynutil.insert("\"")

            fractional_plus_min = graph_fractional + insert_space + pynutil.insert(curr_symbol) @ graph_min_plural
            fractional_plus_min |= (
                graph_fractional_one + insert_space + pynutil.insert(curr_symbol) @ graph_min_singular
            )

            decimal_graph_with_minor_curr = integer_plus_maj + pynini.cross(".", " ") + fractional_plus_min

            if not deterministic:
                decimal_graph_with_minor_curr |= pynutil.add_weight(
                    integer_plus_maj
                    + pynini.cross(".", " ")
                    + pynutil.insert("fractional_part: \"")
                    + two_digits_fractional_part @ cardinal.graph_hundred_component_at_least_one_none_zero_digit
                    + pynutil.insert("\""),
                    weight=0.0001,
                )
                default_fraction_graph = (decimal_delete_last_zeros | decimal_with_quantity) @ graph_decimal_final
            decimal_graph_with_minor_curr |= (
                pynini.closure(pynutil.delete("0"), 0, 1) + pynutil.delete(".") + fractional_plus_min
            )
            decimal_graph_with_minor_curr = (
                pynutil.delete(curr_symbol) + decimal_graph_with_minor_curr + preserve_order
            )

            decimal_graph_with_minor = (
                decimal_graph_with_minor_curr
                if decimal_graph_with_minor is None
                else pynini.union(decimal_graph_with_minor, decimal_graph_with_minor_curr).optimize()
            )

            if not deterministic:
                integer_graph_reordered_curr = (
                    pynutil.delete(curr_symbol) + integer_plus_maj + preserve_order
                ).optimize()

                integer_graph_reordered = (
                    integer_graph_reordered_curr
                    if integer_graph_reordered is None
                    else pynini.union(integer_graph_reordered, integer_graph_reordered_curr).optimize()
                )
                decimal_default_reordered_curr = (
                    pynutil.delete(curr_symbol)
                    + default_fraction_graph
                    + insert_space
                    + pynutil.insert(curr_symbol) @ graph_maj_plural
                )

                decimal_default_reordered = (
                    decimal_default_reordered_curr
                    if decimal_default_reordered is None
                    else pynini.union(decimal_default_reordered, decimal_default_reordered_curr)
                ).optimize()

        # weight for SH
        final_graph |= pynutil.add_weight(decimal_graph_with_minor, -0.001)

        if not deterministic:
            final_graph |= integer_graph_reordered | decimal_default_reordered
            # to handle "$2.00" cases
            final_graph |= pynini.compose(
                NEMO_SIGMA + pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), integer_graph_reordered
            )
        final_graph = self.add_tokens(final_graph.optimize())
        self.fst = final_graph.optimize()
Exemplo n.º 14
0
from nemo_text_processing.text_normalization.ar.graph_utils import (
    NEMO_ALPHA,
    NEMO_DIGIT,
    NEMO_SIGMA,
    SINGULAR_TO_PLURAL,
    GraphFst,
    convert_space,
    insert_space,
)
from nemo_text_processing.text_normalization.ar.utils import get_abs_path, load_labels

try:
    import pynini
    from pynini.lib import pynutil

    min_singular = pynini.string_file(get_abs_path("data/currency/currency_minor_singular.tsv"))
    min_plural = pynini.string_file(get_abs_path("data/currency/currency_minor_plural.tsv"))
    maj_singular = pynini.string_file((get_abs_path("data/currency/currency.tsv")))

    PYNINI_AVAILABLE = True
except (ModuleNotFoundError, ImportError):
    PYNINI_AVAILABLE = False


class MoneyFst(GraphFst):
    """
    Finite state transducer for classifying money, suppletive aware, e.g. 
        $12.05 -> money { integer_part: "twelve" currency_maj: "dollars" fractional_part: "five" currency_min: "cents" preserve_order: true }
        $12.0500 -> money { integer_part: "twelve" currency_maj: "dollars" fractional_part: "five" currency_min: "cents" preserve_order: true }
        $1 -> money { currency_maj: "dollar" integer_part: "one" }
        $1.00 -> money { currency_maj: "dollar" integer_part: "one" }
Exemplo n.º 15
0
    NEMO_PUNCT = pynini.union(
        *map(pynini.escape, string.punctuation)).optimize()
    NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize()

    NEMO_SIGMA = pynini.closure(NEMO_CHAR)

    delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE))
    insert_space = pynutil.insert(" ")
    delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")
    delete_preserve_order = pynini.closure(
        pynutil.delete(" preserve_order: true")
        | (pynutil.delete(" field_order: \"") + NEMO_NOT_QUOTE +
           pynutil.delete("\"")))

    suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
    # _v = pynini.union("a", "e", "i", "o", "u")
    _c = pynini.union("b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n",
                      "p", "q", "r", "s", "t", "v", "w", "x", "y", "z")
    _ies = NEMO_SIGMA + _c + pynini.cross("y", "ies")
    _es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x",
                                    "z") + pynutil.insert("es")
    _s = NEMO_SIGMA + pynutil.insert("s")

    graph_plural = plurals._priority_union(
        suppletive,
        plurals._priority_union(_ies,
                                plurals._priority_union(_es, _s, NEMO_SIGMA),
                                NEMO_SIGMA), NEMO_SIGMA).optimize()

    SINGULAR_TO_PLURAL = graph_plural