Exemplo n.º 1
0
    def __init__(self,
                 input_case: str,
                 deterministic: bool = True,
                 input_file: str = None):
        """
        Build the whitelist classification grammar.

        Args:
            input_case: when equal to "lower_cased", the first column of every
                whitelist entry is lower-cased before the graph is compiled.
            deterministic: if False, lower-cased variants of the default
                whitelist (weight 0.0001) and the measurement units are
                accepted as well.
            input_file: optional path to an additional whitelist file. In
                deterministic mode it replaces the default whitelist,
                otherwise it is unioned with it.
        """
        super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

        def _compile_labels(case, path):
            # Turn the rows of a label file into a string-map graph.
            rows = load_labels(path)
            if case == "lower_cased":
                rows = [[row[0].lower()] + row[1:] for row in rows]
            return pynini.string_map(rows)

        default_whitelist = get_abs_path("data/whitelist.tsv")
        accept_variants = not deterministic

        graph = _compile_labels(input_case, default_whitelist)
        if accept_variants and input_case != "lower_cased":
            # Lower-cased entries are alternatives with a small weight.
            lowered = _compile_labels("lower_cased", default_whitelist)
            graph |= pynutil.add_weight(lowered, weight=0.0001)

        if input_file:
            custom = _compile_labels(input_case, input_file)
            if accept_variants:
                graph |= custom
            else:
                graph = custom

        if accept_variants:
            measurements = get_abs_path("data/measures/measurements.tsv")
            graph |= _compile_labels(input_case, measurements)

        self.graph = graph
        self.final_graph = convert_space(self.graph).optimize()

        open_field = pynutil.insert("name: \"")
        close_field = pynutil.insert("\"")
        self.fst = (open_field + self.final_graph + close_field).optimize()
Exemplo n.º 2
0
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, GraphFst, insert_space
from nemo_text_processing.text_normalization.es.graph_utils import ones
from nemo_text_processing.text_normalization.es.utils import get_abs_path

# Number graphs are compiled at import time. Without pynini every graph is
# None and PYNINI_AVAILABLE is False.
try:
    import pynini
    from pynini.lib import pynutil

    graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
    graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
    graph_twenties = pynini.string_file(get_abs_path("data/numbers/twenties.tsv"))
except ImportError:  # ModuleNotFoundError is a subclass of ImportError
    PYNINI_AVAILABLE = False
    graph_digit = graph_ties = graph_teen = graph_twenties = None
else:
    PYNINI_AVAILABLE = True
Exemplo n.º 3
0
# limitations under the License.
from nemo_text_processing.text_normalization.en.graph_utils import (
    NEMO_NOT_QUOTE,
    NEMO_SIGMA,
    GraphFst,
    delete_preserve_order,
    delete_space,
    insert_space,
)
from nemo_text_processing.text_normalization.es.utils import get_abs_path

# Time-of-day graphs are compiled at import time. Without pynini every graph
# is None and PYNINI_AVAILABLE is False.
try:
    import pynini
    from pynini.lib import pynutil

    alt_minutes = pynini.string_file(
        get_abs_path("data/time/alt_minutes.tsv"))
    morning_times = pynini.string_file(
        get_abs_path("data/time/morning_times.tsv"))
    afternoon_times = pynini.string_file(
        get_abs_path("data/time/afternoon_times.tsv"))
    evening_times = pynini.string_file(
        get_abs_path("data/time/evening_times.tsv"))
except ImportError:  # ModuleNotFoundError is a subclass of ImportError
    PYNINI_AVAILABLE = False
    alt_minutes = None
    morning_times = afternoon_times = evening_times = None
else:
    PYNINI_AVAILABLE = True
Exemplo n.º 4
0
# limitations under the License.
from nemo_text_processing.text_normalization.en.graph_utils import (
    NEMO_NOT_QUOTE,
    NEMO_SIGMA,
    GraphFst,
    delete_preserve_order,
    insert_space,
)
from nemo_text_processing.text_normalization.es.utils import get_abs_path

# pynini is optional: the graphs below are compiled at import time, and the
# handler resets names to None when the import fails.
try:
    import pynini
    from pynini.lib import pynutil

    # pynini.invert swaps the input and output sides of the loaded mappings.
    digit_no_zero = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/digit.tsv")))
    zero = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/zero.tsv")))

    # These mappings are used in the direction stored in the TSV files.
    graph_symbols = pynini.string_file(
        get_abs_path("data/electronic/symbols.tsv"))
    server_common = pynini.string_file(
        get_abs_path("data/electronic/server_name.tsv"))
    domain_common = pynini.string_file(
        get_abs_path("data/electronic/domain.tsv"))

    PYNINI_AVAILABLE = True

except (ModuleNotFoundError, ImportError):
    digit_no_zero = None
    zero = None
Exemplo n.º 5
0
    delete_space,
    insert_space,
)
from nemo_text_processing.text_normalization.es.graph_utils import (
    cardinal_separator,
    decimal_separator,
    strip_cardinal_apocope,
)
from nemo_text_processing.text_normalization.es.utils import get_abs_path

# Decimal graphs are compiled at import time. Without pynini every graph is
# None and PYNINI_AVAILABLE is False.
try:
    import pynini
    from pynini.lib import pynutil

    quantities = pynini.string_file(get_abs_path("data/numbers/quantities.tsv"))
    # digit and zero are inverted: the TSV output side becomes the input side.
    digit = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv")))
    zero = pynini.invert(pynini.string_file(get_abs_path("data/numbers/zero.tsv")))
except ImportError:  # ModuleNotFoundError is a subclass of ImportError
    PYNINI_AVAILABLE = False
    quantities = digit = zero = None
else:
    PYNINI_AVAILABLE = True

Exemplo n.º 6
0
    NEMO_SIGMA,
    NEMO_SPACE,
    NEMO_WHITE_SPACE,
    GraphFst,
    delete_extra_space,
    delete_preserve_order,
)
from nemo_text_processing.text_normalization.es.graph_utils import ones
from nemo_text_processing.text_normalization.es.utils import get_abs_path

# pynini is optional: the unit acceptors below are compiled at import time,
# and the handler resets names to None when the import fails.
try:
    import pynini
    from pynini.lib import pynutil

    # Load the two measurement mappings.
    unit_plural_fem = pynini.string_file(
        get_abs_path("data/measures/measurements_plural_fem.tsv"))
    unit_plural_masc = pynini.string_file(
        get_abs_path("data/measures/measurements_plural_masc.tsv"))

    # The input side of each mapping is kept under the singular names.
    unit_singular_fem = pynini.project(unit_plural_fem, "input")
    unit_singular_masc = pynini.project(unit_plural_masc, "input")

    # The plural names are rebound to the output side only. This has to come
    # after the singular projections, which still need the full mappings.
    unit_plural_fem = pynini.project(unit_plural_fem, "output")
    unit_plural_masc = pynini.project(unit_plural_masc, "output")

    PYNINI_AVAILABLE = True

except (ModuleNotFoundError, ImportError):
    unit_plural_fem = None
    unit_plural_masc = None
Exemplo n.º 7
0
from nemo_text_processing.text_normalization.en.graph_utils import (
    NEMO_CHAR,
    NEMO_SIGMA,
    NEMO_SPACE,
    GraphFst,
    delete_space,
)
from nemo_text_processing.text_normalization.es.graph_utils import roman_to_int, strip_accent
from nemo_text_processing.text_normalization.es.utils import get_abs_path

# pynini is optional: the ordinal graphs below are compiled at import time,
# and the handler resets names to None when the import fails.
try:
    import pynini
    from pynini.lib import pynutil

    # Every ordinal mapping is inverted, i.e. its input and output sides are
    # swapped relative to the TSV file.
    digit = pynini.invert(
        pynini.string_file(get_abs_path("data/ordinals/digit.tsv")))
    teens = pynini.invert(
        pynini.string_file(get_abs_path("data/ordinals/teen.tsv")))
    twenties = pynini.invert(
        pynini.string_file(get_abs_path("data/ordinals/twenties.tsv")))
    ties = pynini.invert(
        pynini.string_file(get_abs_path("data/ordinals/ties.tsv")))
    hundreds = pynini.invert(
        pynini.string_file(get_abs_path("data/ordinals/hundreds.tsv")))

    PYNINI_AVAILABLE = True

except (ImportError, ModuleNotFoundError):
    digit = None
    teens = None
    twenties = None
Exemplo n.º 8
0
    NEMO_NON_BREAKING_SPACE,
    NEMO_SIGMA,
    NEMO_SPACE,
    GraphFst,
    convert_space,
    delete_space,
    insert_space,
)
from nemo_text_processing.text_normalization.es.graph_utils import strip_cardinal_apocope
from nemo_text_processing.text_normalization.es.utils import get_abs_path

# Measurement graphs are compiled at import time. Without pynini every graph
# is None and PYNINI_AVAILABLE is False.
try:
    import pynini
    from pynini.lib import pynutil

    unit = pynini.string_file(
        get_abs_path("data/measures/measurements.tsv"))
    unit_plural_fem = pynini.string_file(
        get_abs_path("data/measures/measurements_plural_fem.tsv"))
    unit_plural_masc = pynini.string_file(
        get_abs_path("data/measures/measurements_plural_masc.tsv"))
except ImportError:  # ModuleNotFoundError is a subclass of ImportError
    PYNINI_AVAILABLE = False
    unit = unit_plural_fem = unit_plural_masc = None
else:
    PYNINI_AVAILABLE = True


class MeasureFst(GraphFst):
    """
Exemplo n.º 9
0
# limitations under the License.
from nemo_text_processing.text_normalization.en.graph_utils import (
    NEMO_CHAR,
    NEMO_DIGIT,
    NEMO_SIGMA,
    NEMO_SPACE,
    GraphFst,
)
from nemo_text_processing.text_normalization.es.utils import get_abs_path

# Fraction graphs are compiled at import time. Without pynini both graphs are
# None and PYNINI_AVAILABLE is False.
try:
    import pynini
    from pynini.lib import pynutil

    ordinal_exceptions = pynini.string_file(get_abs_path("data/fractions/ordinal_exceptions.tsv"))
    higher_powers_of_ten = pynini.string_file(get_abs_path("data/fractions/powers_of_ten.tsv"))
except ImportError:  # ModuleNotFoundError is a subclass of ImportError
    PYNINI_AVAILABLE = False
    ordinal_exceptions = higher_powers_of_ten = None
else:
    PYNINI_AVAILABLE = True


class FractionFst(GraphFst):
    """
    Finite state transducer for classifying fraction
Exemplo n.º 10
0
from nemo_text_processing.text_normalization.en.graph_utils import (
    NEMO_NOT_QUOTE,
    NEMO_SIGMA,
    NEMO_SPACE,
    GraphFst,
    delete_preserve_order,
)
from nemo_text_processing.text_normalization.es.graph_utils import shift_cardinal_gender, strip_cardinal_apocope
from nemo_text_processing.text_normalization.es.utils import get_abs_path

# pynini is optional: the currency acceptors below are compiled at import
# time, and the handler resets names to None when the import fails.
try:
    import pynini
    from pynini.lib import pynutil

    # Load the two currency mappings.
    fem = pynini.string_file(
        (get_abs_path("data/money/currency_plural_fem.tsv")))
    masc = pynini.string_file(
        (get_abs_path("data/money/currency_plural_masc.tsv")))

    # The input side of each mapping is kept under the singular names.
    fem_singular = pynini.project(fem, "input")
    masc_singular = pynini.project(masc, "input")

    # The output side of each mapping is kept under the plural names.
    fem_plural = pynini.project(fem, "output")
    masc_plural = pynini.project(masc, "output")

    PYNINI_AVAILABLE = True

except (ModuleNotFoundError, ImportError):
    fem_plural = None
    masc_plural = None
Exemplo n.º 11
0
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_ALPHA, NEMO_DIGIT, GraphFst, insert_space
from nemo_text_processing.text_normalization.es.utils import get_abs_path, load_labels

# The label lists are built at import time. Without pynini both lists are
# None and PYNINI_AVAILABLE is False.
try:
    import pynini
    from pynini.lib import pynutil

    # Only the first column of each label row is kept.
    common_domains = [row[0] for row in load_labels(get_abs_path("data/electronic/domain.tsv"))]
    symbols = [row[0] for row in load_labels(get_abs_path("data/electronic/symbols.tsv"))]
except ImportError:  # ModuleNotFoundError is a subclass of ImportError
    PYNINI_AVAILABLE = False
    common_domains = symbols = None
else:
    PYNINI_AVAILABLE = True


class ElectronicFst(GraphFst):
Exemplo n.º 12
0
    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        """
        Build the time classification grammar and store it in ``self.fst``.

        Args:
            cardinal: grammar whose ``graph`` attribute is composed with the
                hour, minute and second digit strings.
            deterministic: if True, `` preserve_order: true`` is appended to
                every match; if False, the shifted-time alternatives carrying
                a ``style`` field are added as well.
        """
        super().__init__(name="time",
                         kind="classify",
                         deterministic=deterministic)

        # Hours, minutes and seconds may be separated by "." or ":".
        delete_time_delimiter = pynutil.delete(pynini.union(".", ":"))

        # Rewrite "un"/"ún" to "una" in the output of the cardinal graph.
        one = pynini.string_map([("un", "una"), ("ún", "una")])
        change_one = pynini.cdrewrite(one, "", "", NEMO_SIGMA)
        cardinal_graph = cardinal.graph @ change_one

        # NOTE(review): `suffix` and `time_zone_graph` are not defined in this
        # method; presumably module-level graphs - verify.
        day_suffix = pynutil.insert("suffix: \"") + suffix + pynutil.insert(
            "\"")
        day_suffix = delete_space + insert_space + day_suffix

        # The unit markers "h", "min" and "s" are deleted from the input.
        delete_hora_suffix = delete_space + insert_space + pynutil.delete("h")
        delete_minute_suffix = delete_space + insert_space + pynutil.delete(
            "min")
        delete_second_suffix = delete_space + insert_space + pynutil.delete(
            "s")

        labels_hour_24 = [
            str(x) for x in range(0, 25)
        ]  # Can see both systems. Twelve hour requires am/pm for ambiguity resolution
        labels_hour_12 = [str(x) for x in range(1, 13)]
        # "00" is in neither minute label set; the graphs below handle it
        # explicitly.
        labels_minute_single = [str(x) for x in range(1, 10)]
        labels_minute_double = [str(x) for x in range(10, 60)]

        # One or two symbols; a leading "0" of a two-symbol string is deleted.
        delete_leading_zero_to_double_digit = (
            pynini.closure(pynutil.delete("0") |
                           (NEMO_DIGIT - "0"), 0, 1) + NEMO_DIGIT)

        graph_24 = (pynini.closure(NEMO_DIGIT, 1,
                                   2) @ delete_leading_zero_to_double_digit
                    @ pynini.union(*labels_hour_24))
        graph_12 = (pynini.closure(NEMO_DIGIT, 1,
                                   2) @ delete_leading_zero_to_double_digit
                    @ pynini.union(*labels_hour_12))

        graph_hour_24 = graph_24 @ cardinal_graph
        graph_hour_12 = graph_12 @ cardinal_graph

        graph_minute_single = delete_leading_zero_to_double_digit @ pynini.union(
            *labels_minute_single)
        graph_minute_double = pynini.union(*labels_minute_double)

        graph_minute = pynini.union(graph_minute_single,
                                    graph_minute_double) @ cardinal_graph

        # Hour-only forms must be followed by "h" (24-hour labels) or by a day
        # suffix (12-hour labels).
        final_graph_hour_only_24 = (pynutil.insert("hours: \"") +
                                    graph_hour_24 + pynutil.insert("\"") +
                                    delete_hora_suffix)
        final_graph_hour_only_12 = pynutil.insert(
            "hours: \"") + graph_hour_12 + pynutil.insert("\"") + day_suffix

        final_graph_hour_24 = pynutil.insert(
            "hours: \"") + graph_hour_24 + pynutil.insert("\"")
        final_graph_hour_12 = pynutil.insert(
            "hours: \"") + graph_hour_12 + pynutil.insert("\"")

        final_graph_minute = pynutil.insert(
            "minutes: \"") + graph_minute + pynutil.insert("\"")
        # Seconds reuse the minute graph.
        final_graph_second = pynutil.insert(
            "seconds: \"") + graph_minute + pynutil.insert("\"")
        final_time_zone_optional = pynini.closure(
            delete_space + insert_space + pynutil.insert("zone: \"") +
            time_zone_graph + pynutil.insert("\""),
            0,
            1,
        )

        # 02.30 h
        # Minutes "00" are deleted; seconds "00" become a seconds field of "0".
        graph_hm = (
            final_graph_hour_24 + delete_time_delimiter +
            (pynutil.delete("00") |
             (insert_space + final_graph_minute)) + pynini.closure(
                 delete_time_delimiter +
                 (pynini.cross("00", " seconds: \"0\"") |
                  (insert_space + final_graph_second)),
                 0,
                 1,
             )  # For seconds 2.30.35 h
            + pynini.closure(delete_hora_suffix, 0,
                             1)  # 2.30 is valid if unambiguous
            + final_time_zone_optional)

        # 2 h 30 min
        graph_hm |= (
            final_graph_hour_24 + delete_hora_suffix + delete_space +
            (pynutil.delete("00") | (insert_space + final_graph_minute)) +
            delete_minute_suffix + pynini.closure(
                delete_space +
                (pynini.cross("00", " seconds: \"0\"") |
                 (insert_space + final_graph_second)) + delete_second_suffix,
                0,
                1,
            )  # For seconds
            + final_time_zone_optional)

        # 2.30 a. m. (Only for 12 hour clock)
        graph_hm |= (
            final_graph_hour_12 + delete_time_delimiter +
            (pynutil.delete("00") |
             (insert_space + final_graph_minute)) + pynini.closure(
                 delete_time_delimiter +
                 (pynini.cross("00", " seconds: \"0\"") |
                  (insert_space + final_graph_second)),
                 0,
                 1,
             )  # For seconds 2.30.35 a. m.
            + day_suffix + final_time_zone_optional)

        graph_h = (
            pynini.union(final_graph_hour_only_24, final_graph_hour_only_12) +
            final_time_zone_optional
        )  # Should always have a time indicator, else we'll pass to cardinals

        if not deterministic:
            # This includes alternate vocalization (hour menos min, min para hour), here we shift the times and indicate a `style` tag
            # The two hour tables are inverted after loading; the minute table
            # is used in the direction stored in its file.
            hour_shift_24 = pynini.invert(
                pynini.string_file(get_abs_path("data/time/hour_to_24.tsv")))
            hour_shift_12 = pynini.invert(
                pynini.string_file(get_abs_path("data/time/hour_to_12.tsv")))
            minute_shift = pynini.string_file(
                get_abs_path("data/time/minute_to.tsv"))

            graph_hour_to_24 = graph_24 @ hour_shift_24 @ cardinal_graph
            graph_hour_to_12 = graph_12 @ hour_shift_12 @ cardinal_graph

            graph_minute_to = pynini.union(
                graph_minute_single,
                graph_minute_double) @ minute_shift @ cardinal_graph

            final_graph_hour_to_24 = pynutil.insert(
                "hours: \"") + graph_hour_to_24 + pynutil.insert("\"")
            final_graph_hour_to_12 = pynutil.insert(
                "hours: \"") + graph_hour_to_12 + pynutil.insert("\"")

            final_graph_minute_to = pynutil.insert(
                "minutes: \"") + graph_minute_to + pynutil.insert("\"")

            graph_menos = pynutil.insert(" style: \"1\"")
            graph_para = pynutil.insert(" style: \"2\"")

            # Either style tag may be appended to a shifted time.
            final_graph_style = graph_menos | graph_para

            # 02.30 h (omitting seconds since a bit awkward)
            graph_hm |= (
                final_graph_hour_to_24 + delete_time_delimiter +
                insert_space + final_graph_minute_to + pynini.closure(
                    delete_hora_suffix, 0, 1)  # 2.30 is valid if unambiguous
                + final_time_zone_optional + final_graph_style)

            # 2 h 30 min
            graph_hm |= (final_graph_hour_to_24 + delete_hora_suffix +
                         delete_space + insert_space + final_graph_minute_to +
                         delete_minute_suffix + final_time_zone_optional +
                         final_graph_style)

            # 2.30 a. m. (Only for 12 hour clock)
            graph_hm |= (final_graph_hour_to_12 + delete_time_delimiter +
                         insert_space + final_graph_minute_to + day_suffix +
                         final_time_zone_optional + final_graph_style)

        final_graph = graph_hm | graph_h
        # The preserve_order field is only emitted in deterministic mode.
        if deterministic:
            final_graph = final_graph + pynutil.insert(" preserve_order: true")
        final_graph = final_graph.optimize()
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Exemplo n.º 13
0
# limitations under the License.
from nemo_text_processing.text_normalization.en.graph_utils import (
    NEMO_DIGIT,
    NEMO_SIGMA,
    GraphFst,
    delete_space,
    insert_space,
)
from nemo_text_processing.text_normalization.es.utils import get_abs_path

# Time zone and suffix graphs are compiled at import time. Without pynini both
# graphs are None and PYNINI_AVAILABLE is False.
try:
    import pynini
    from pynini.lib import pynutil

    time_zone_graph = pynini.string_file(get_abs_path("data/time/time_zone.tsv"))
    suffix = pynini.string_file(
        get_abs_path("data/time/time_suffix.tsv"))
except ImportError:  # ModuleNotFoundError is a subclass of ImportError
    PYNINI_AVAILABLE = False
    time_zone_graph = suffix = None
else:
    PYNINI_AVAILABLE = True


class TimeFst(GraphFst):
    """
    Finite state transducer for classifying time, e.g.
        "02:15 est" -> time { hours: "dos" minutes: "quince" zone: "e s t"}
Exemplo n.º 14
0
 def _load_roman(file: str):
     """
     Compile a two-column label file into a string-map graph.

     Every (numeral, value) row is included twice: once as read from the file
     and once with the numeral upper-cased.

     Args:
         file: path passed to ``get_abs_path`` to locate the label file.
     """
     pairs = load_labels(get_abs_path(file))
     as_written = [(numeral, value) for numeral, value in pairs]
     upper_cased = [(numeral.upper(), value) for numeral, value in pairs]
     return pynini.string_map(as_written + upper_cased)
Exemplo n.º 15
0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, NEMO_SPACE
from nemo_text_processing.text_normalization.es import LOCALIZATION
from nemo_text_processing.text_normalization.es.utils import get_abs_path, load_labels

try:
    import pynini
    from pynini.lib import pynutil

    digits = pynini.project(
        pynini.string_file(get_abs_path("data/numbers/digit.tsv")), "input")
    tens = pynini.project(
        pynini.string_file(get_abs_path("data/numbers/ties.tsv")), "input")
    teens = pynini.project(
        pynini.string_file(get_abs_path("data/numbers/teen.tsv")), "input")
    twenties = pynini.project(
        pynini.string_file(get_abs_path("data/numbers/twenties.tsv")), "input")
    hundreds = pynini.project(
        pynini.string_file(get_abs_path("data/numbers/hundreds.tsv")), "input")

    accents = pynini.string_map([("á", "a"), ("é", "e"), ("í", "i"),
                                 ("ó", "o"), ("ú", "u")])

    if LOCALIZATION == "am":  # Setting localization for central and northern america formatting
        cardinal_separator = pynini.string_map([",", NEMO_SPACE])
        decimal_separator = pynini.accep(".")