Пример #1
0
    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        """
        Build the "measure" classifier: an optionally negative cardinal or decimal
        followed by a unit; the cardinal form may carry a one-word fractional tail
        after the unit.

        Args:
            cardinal: tagger whose ``graph_no_exception`` is used for the integer part
            decimal: tagger whose ``final_graph_wo_negative`` is used for the decimal part
        """
        super().__init__(name="measure", kind="classify")

        integer_fst = cardinal.graph_no_exception

        # Words accepted as the fractional tail that may follow the unit.
        digit_fst = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        fraction_word = (
            digit_fst
            | pynini.cross("rưỡi", "5")
            | pynini.cross("mốt", "1")
            | pynini.cross("tư", "4")
        )

        # The measurements table is used in the inverted direction.
        unit_lookup = pynini.invert(
            pynini.string_file(get_abs_path("data/measurements.tsv")))

        # Optional sign: "âm" / "trừ" is rewritten to negative: "true".
        sign = pynini.closure(
            pynutil.insert("negative: ")
            + pynini.cross(pynini.union("âm", "trừ"), '"true"')
            + delete_extra_space,
            0,
            1,
        )

        plain_unit = convert_space(unit_lookup)
        # "trên <unit>" is rendered as "/<unit>".
        per_unit = (
            pynutil.insert("/")
            + pynutil.delete("trên")
            + delete_space
            + convert_space(unit_lookup)
        )
        # A unit, a "/unit", or (slightly penalised) a unit followed by "/unit".
        compound_unit = pynutil.add_weight(
            plain_unit + delete_space + per_unit, 0.01)
        units_field = (
            pynutil.insert('units: "')
            + (plain_unit | per_unit | compound_unit)
            + pynutil.insert('"')
        )

        decimal_branch = (
            pynutil.insert("decimal { ")
            + sign
            + decimal.final_graph_wo_negative
            + pynutil.insert(" }")
            + delete_extra_space
            + units_field
        )

        cardinal_branch = (
            pynutil.insert("cardinal { ")
            + sign
            + pynutil.insert('integer: "')
            + integer_fst
            + pynutil.insert('"')
            + pynutil.insert(" }")
            + delete_extra_space
            + units_field
        )

        fraction_tail = (
            delete_extra_space
            + pynutil.insert('fractional_part: "')
            + fraction_word
            + pynutil.insert('"')
        )

        # Same cardinal + unit shape, but with the fractional tail appended.
        cardinal_branch_with_fraction = (
            pynutil.insert("cardinal { ")
            + sign
            + pynutil.insert('integer: "')
            + integer_fst
            + pynutil.insert('" }')
            + delete_extra_space
            + units_field
            + fraction_tail
        )
        cardinal_branch = cardinal_branch | cardinal_branch_with_fraction

        tagged = self.add_tokens(decimal_branch | cardinal_branch)
        self.fst = tagged.optimize()
Пример #2
0
    def __init__(self):
        """
        Build the "electronic" classifier, which covers two shapes: an e-mail-like
        form (username, spoken "at" word, domain) and a URL-like form
        (optional spelled scheme, "www", dotted name parts).
        """
        super().__init__(name="electronic", kind="classify")

        # Within this grammar a separating space is simply deleted.
        drop_space = pynutil.delete(" ")

        digit_fst = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        zero_fst = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        alpha_num = NEMO_ALPHA | digit_fst | zero_fst

        # The symbols table is used in the inverted direction.
        symbols = pynini.invert(
            pynini.string_file(get_abs_path("data/electronic/symbols.tsv")))

        name_char = alpha_num | symbols
        dot = pynini.cross("chấm", ".")

        # NOTE(review): the "******" literal looks like a masked placeholder rather
        # than a real username graph -- confirm against the upstream grammar.
        username = pynutil.insert('username: "******"')

        # One or more alphanumerics separated by (deleted) single spaces.
        spelled = pynini.closure(alpha_num + drop_space) + alpha_num
        server = spelled | pynini.string_file(
            get_abs_path("data/electronic/server_name.tsv"))
        domain = spelled | pynini.string_file(
            get_abs_path("data/electronic/domain.tsv"))

        # One or more ".domain" groups.
        dotted_domains = (
            pynini.closure(dot + drop_space + domain + drop_space)
            + dot
            + drop_space
            + domain
        )
        domain_field = (
            pynutil.insert('domain: "')
            + server
            + drop_space
            + dotted_domains
            + pynutil.insert('"')
        )

        at_word = pynutil.delete(pynini.union("a còng", "a móc", "a vòng"))
        email = (
            username
            + drop_space
            + at_word
            + insert_space
            + drop_space
            + domain_field
        )

        # ---- url ----
        www = pynini.cross(pynini.union("w w w", "www"), "www")
        scheme = (
            pynini.cross("h t t p", "http") | pynini.cross("h t t p s", "https")
        ) + pynini.cross(" hai chấm sẹc sẹc ", "://")

        # Trailing part such as ".com": a symbol followed by a domain or spelled name.
        spelled_name = pynini.closure(name_char + drop_space) + name_char
        ending = drop_space + symbols + drop_space + (domain | spelled_name)

        url = (
            pynini.closure(scheme, 0, 1)
            + www
            + drop_space
            + dot
            + pynini.closure(drop_space + name_char, 1)
            + pynini.closure(ending, 1, 2)
        )
        url_field = pynutil.insert('protocol: "') + url + pynutil.insert('"')

        tagged = self.add_tokens(email | url_field)
        self.fst = tagged.optimize()
Пример #3
0
def _get_month_graph():
    """
    Load the month transducer from data/months.tsv and return it optimized.
    """
    months_path = get_abs_path("data/months.tsv")
    return pynini.string_file(months_path).optimize()
Пример #4
0
    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        """
        Build the "money" classifier: an integer or decimal amount, a currency
        word and an optional trailing fractional amount.

        Args:
            cardinal: tagger whose ``graph_no_exception`` is used for integer amounts
            decimal: tagger whose ``final_graph_wo_negative`` is used for decimal amounts
        """
        super().__init__(name="money", kind="classify")
        # fields: quantity, integer_part, fractional_part, currency

        integer_fst = cardinal.graph_no_exception
        decimal_fst = decimal.final_graph_wo_negative

        # The currency table is used in the inverted direction.
        currency_lookup = pynini.invert(
            pynini.string_file(get_abs_path("data/currency.tsv")))
        currency_field = (
            pynutil.insert('currency: "')
            + convert_space(currency_lookup)
            + pynutil.insert('"')
        )

        # Two digits pass through unchanged; a single digit gets a leading "0".
        two_digits = (NEMO_DIGIT + NEMO_DIGIT) | (
            pynutil.insert("0") + NEMO_DIGIT)

        # Trailing amount after the currency word: a (favoured) cardinal or "rưỡi".
        cents_value = pynutil.add_weight(
            integer_fst @ two_digits, -0.7) | pynini.cross("rưỡi", "5")
        optional_cents = pynini.closure(
            delete_extra_space
            + pynutil.insert('fractional_part: "')
            + cents_value
            + pynutil.insert('"'),
            0,
            1,
        )

        integer_amount = (
            pynutil.insert('integer_part: "')
            + integer_fst
            + pynutil.insert('"')
            + delete_extra_space
            + currency_field
            + optional_cents
        )
        decimal_amount = (
            decimal_fst
            + delete_extra_space
            + currency_field
            + optional_cents
        )

        tagged = self.add_tokens(integer_amount | decimal_amount)
        self.fst = tagged.optimize()
Пример #5
0
    def __init__(self):
        """
        Build the "telephone" classifier: a run of at least three spoken digits,
        where only the final position also accepts "mốt", "tư" and "lăm".
        """
        super().__init__(name="telephone", kind="classify")

        zero_fst = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        digit_fst = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        any_digit = digit_fst | zero_fst

        # Variants accepted only in the last position.
        final_digit = (
            any_digit
            | pynini.cross("mốt", "1")
            | pynini.cross("tư", "4")
            | pynini.cross("lăm", "5")
        )

        # Two or more leading digits (separating whitespace removed) plus the final one.
        digits = pynini.closure(any_digit + delete_space, 2) + final_digit
        number_field = (
            pynutil.insert('number_part: "') + digits + pynutil.insert('"'))

        tagged = self.add_tokens(number_field)
        self.fst = tagged.optimize()
Пример #6
0
    def __init__(self):
        """
        Build the "whitelist" classifier from data/whitelist.tsv.

        The table is inverted, passed through ``convert_space`` and wrapped in a
        ``name: "..."`` field; the result is stored without an ``add_tokens`` wrapper.
        """
        super().__init__(name="whitelist", kind="classify")

        table = pynini.invert(
            pynini.string_file(get_abs_path("data/whitelist.tsv")))
        name_field = (
            pynutil.insert('name: "')
            + convert_space(table)
            + pynutil.insert('"')
        )
        self.fst = name_field.optimize()
Пример #7
0
    def __init__(self):
        """
        Build the "ordinal" classifier: the word "thứ" followed by an entry of
        data/ordinals/digit.tsv, emitted as ``integer: "..."``.

        The bare digit graph is also exposed as ``self.graph``.
        """
        super().__init__(name="ordinal", kind="classify")

        self.graph = pynini.string_file(
            get_abs_path("data/ordinals/digit.tsv"))

        # The ordinal marker is consumed and produces no output.
        marker = pynini.cross("thứ", "")

        integer_field = (
            pynutil.insert('integer: "')
            + marker
            + delete_space
            + self.graph
            + pynutil.insert('"')
        )
        tagged = self.add_tokens(integer_field)
        self.fst = tagged.optimize()
Пример #8
0
    def __init__(self, cardinal: GraphFst):
        """
        Build the "decimal" classifier: an optional sign, an optional integer part,
        a spoken decimal point ("chấm" / "phẩy") and a fractional digit sequence,
        plus the quantity forms produced by ``get_quantity``.

        Exposes:
            self.graph: the fractional digit-sequence graph
            self.final_graph_wo_negative: unsigned decimal graph, including quantity forms

        Args:
            cardinal: tagger supplying ``graph_no_exception`` and
                ``graph_hundred_component_at_least_one_none_zero_digit``
        """
        super().__init__(name="decimal", kind="classify")

        cardinal_graph = cardinal.graph_no_exception

        # graph_digit is the module-level digit table.
        graph_decimal = graph_digit | pynini.string_file(
            get_abs_path("data/numbers/zero.tsv"))
        graph_one = pynini.cross("mốt", "1")
        graph_four = pynini.cross("tư", "4")
        graph_five = pynini.cross("lăm", "5")

        # A single digit, "tư" alone, or a multi-digit sequence whose last
        # position also accepts "tư", "lăm" and "mốt".
        graph_decimal = pynini.union(
            graph_decimal,
            graph_four,
            pynini.closure(graph_decimal + delete_space, 1) +
            (graph_decimal | graph_four | graph_five | graph_one),
        )
        self.graph = graph_decimal

        point = pynutil.delete("chấm") | pynutil.delete("phẩy")

        optional_graph_negative = pynini.closure(
            pynutil.insert("negative: ") +
            pynini.cross(pynini.union("âm", "trừ"), '"true"') +
            delete_extra_space,
            0,
            1,
        )

        graph_fractional = pynutil.insert(
            'fractional_part: "') + graph_decimal + pynutil.insert('"')
        graph_integer = pynutil.insert(
            'integer_part: "') + cardinal_graph + pynutil.insert('"')
        final_graph_wo_sign = (
            pynini.closure(graph_integer + delete_extra_space, 0, 1) + point +
            delete_extra_space + graph_fractional)

        # Build the quantity graph once; the original constructed it twice with
        # identical arguments. Both uses below create new FSTs via `|` and `+`.
        quantity_graph = get_quantity(
            final_graph_wo_sign,
            cardinal.graph_hundred_component_at_least_one_none_zero_digit,
        )

        self.final_graph_wo_negative = final_graph_wo_sign | quantity_graph

        final_graph = optional_graph_negative + final_graph_wo_sign
        final_graph |= optional_graph_negative + quantity_graph
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Пример #9
0
    def __init__(self):
        """
        Build the "cardinal" classifier.

        Exposes:
            self.graph_hundred_component_at_least_one_none_zero_digit: three-digit
                component restricted to outputs containing a non-zero digit
            self.graph_no_exception: full cardinal graph
            self.graph: cardinal graph with the inputs of the digit and zero
                tables removed
            self.fst: optimized tagger with optional sign and ``integer`` field
        """
        super().__init__(name="cardinal", kind="classify")
        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        graph_digit = pynini.string_file(
            get_abs_path("data/numbers/digit.tsv"))
        graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))

        # Spoken variants mapped to digits, and scale words that emit nothing.
        graph_one = pynini.cross("mốt", "1")
        graph_four = pynini.cross("tư", "4")
        graph_five = pynini.cross("lăm", "5")
        graph_half = pynini.cross("rưỡi", "5")
        graph_hundred = pynini.cross("trăm", "")
        graph_ten = pynini.cross("mươi", "")
        # "linh" / "lẻ" stands for a zero in the tens position.
        zero = pynini.cross(pynini.union("linh", "lẻ"), "0")

        optional_ten = pynini.closure(delete_space + graph_ten, 0, 1)
        last_digit = graph_digit | graph_one | graph_four | graph_five

        # Form 1: explicit hundreds digit + "trăm", followed by one of the tails.
        graph_hundred_component = (graph_digit
                                   | graph_zero) + delete_space + graph_hundred
        graph_hundred_component += delete_space
        graph_hundred_component += pynini.union(
            graph_teen,
            # ties, optional "mươi", then a last digit or an inserted "0"
            graph_ties + optional_ten +
            ((delete_space + last_digit) | pynutil.insert("0")),
            # "rưỡi" / "tư" / "mốt" directly after the hundreds, padded with "0"
            (graph_half | graph_four | graph_one) + pynutil.insert("0"),
            zero + delete_space + (graph_digit | graph_four),
            graph_digit,
            # nothing spoken after the hundreds
            pynutil.insert("00"),
        )
        # Form 2: no hundreds spoken; a leading "0" is inserted instead.
        graph_hundred_component |= (
            pynutil.insert("0") + delete_space + pynini.union(
                graph_teen,
                graph_ties + optional_ten +
                ((delete_space + last_digit) | pynutil.insert("0")),
                zero + delete_space + (graph_digit | graph_four),
                graph_digit,
            ))

        # Keep only outputs that contain at least one non-zero digit.
        graph_hundred_component_at_least_one_none_zero_digit = graph_hundred_component @ (
            pynini.closure(NEMO_DIGIT) +
            (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT))
        self.graph_hundred_component_at_least_one_none_zero_digit = (
            graph_hundred_component_at_least_one_none_zero_digit)

        # Each scale group is either spoken (component + deleted scale word) or
        # absent, in which case zeros are inserted with a small weight of 0.1.
        graph_thousands = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space + pynutil.delete(pynini.union("nghìn", "ngàn")),
            pynutil.insert("000", weight=0.1),
        )

        graph_ten_thousand = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space + pynutil.delete("vạn"),
            pynutil.insert("0000", weight=0.1),
        )

        # Single thousands digit used after a "vạn" group.
        graph_ten_thousand_suffix = pynini.union(
            graph_digit + delete_space +
            pynutil.delete(pynini.union("nghìn", "ngàn")),
            pynutil.insert("0", weight=0.1),
        )

        graph_million = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space + pynutil.delete("triệu"),
            pynutil.insert("000", weight=0.1),
        )
        graph_billion = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space + pynutil.delete(pynini.union("tỉ", "tỷ")),
            pynutil.insert("000", weight=0.1),
        )

        graph = pynini.union(
            # billion / million / thousand / hundred layout
            graph_billion + delete_space + graph_million + delete_space +
            graph_thousands + delete_space + graph_hundred_component,
            # "vạn" layout
            graph_ten_thousand + delete_space + graph_ten_thousand_suffix +
            delete_space + graph_hundred_component,
            # explicit thousands followed by a single digit (padded with "00")
            # or by a full hundred component
            graph_hundred_component_at_least_one_none_zero_digit +
            delete_space + pynutil.delete(pynini.union("nghìn", "ngàn")) +
            delete_space +
            ((last_digit + pynutil.insert("00")) | graph_hundred_component),
            graph_zero,
        )

        # Strip leading zeros from the digit string; a lone "0" is kept as is.
        graph = graph @ pynini.union(
            pynutil.delete(pynini.closure("0")) + pynini.difference(
                NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0")

        # don't convert cardinals from zero to nine inclusive
        graph_exception = pynini.project(pynini.union(graph_digit, graph_zero),
                                         'input')

        self.graph_no_exception = graph

        # Restrict the input side to everything except the excluded inputs.
        self.graph = (pynini.project(graph, "input") -
                      graph_exception.arcsort()) @ graph

        # Optional sign: "âm" / "trừ" becomes negative: "-"; NEMO_SPACE follows
        # (presumably passing the space through unchanged -- confirm in graph_utils).
        optional_minus_graph = pynini.closure(
            pynutil.insert("negative: ") +
            pynini.cross(pynini.union("âm", "trừ"), "\"-\"") + NEMO_SPACE, 0,
            1)

        final_graph = optional_minus_graph + pynutil.insert(
            "integer: \"") + self.graph + pynutil.insert("\"")

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
Пример #10
0
    def __init__(self):
        """
        Build the "time" classifier. Accepted shapes: hour only, hour + minutes,
        hour + minutes + seconds (each with an optional time zone),
        minutes + seconds, and the "kém" (minutes-to) form.
        """
        super().__init__(name="time", kind="classify")
        # fields: hours, minutes, seconds, suffix, zone, style, speak_period

        hours_to_fst = pynini.string_file(
            get_abs_path("data/time/hours_to.tsv"))
        minutes_to_fst = pynini.string_file(
            get_abs_path("data/time/minutes_to.tsv"))
        hours_fst = pynini.string_file(get_abs_path("data/time/hours.tsv"))
        minutes_fst = pynini.string_file(
            get_abs_path("data/time/minutes.tsv"))
        zone_fst = pynini.invert(
            pynini.string_file(get_abs_path("data/time/time_zone.tsv")))

        # Unit words are consumed without output; "rưỡi" stands for 30.
        half = pynini.cross("rưỡi", "30")
        hour_word = pynini.cross("giờ", "")
        minute_word = pynini.cross("phút", "")
        second_word = pynini.cross("giây", "")
        maybe_minute_word = pynini.closure(delete_space + minute_word, 0, 1)

        hour_field = (
            pynutil.insert('hours: "')
            + hours_fst
            + pynutil.insert('"')
            + delete_space
            + hour_word
        )
        minute_value = minutes_fst + maybe_minute_word
        # The seconds value is built from the minutes value followed by "giây".
        second_value = minute_value + delete_space + second_word

        optional_zone = pynini.closure(
            delete_space
            + insert_space
            + pynutil.insert('zone: "')
            + convert_space(zone_fst)
            + pynutil.insert('"'),
            0,
            1,
        )

        hour_minute = (
            hour_field
            + delete_extra_space
            + pynutil.insert('minutes: "')
            + (minute_value | half)
            + pynutil.insert('"')
        )
        hour_minute_second = (
            hour_minute
            + delete_extra_space
            + pynutil.insert('seconds: "')
            + second_value
            + pynutil.insert('"')
        )
        minute_second = (
            pynutil.insert('minutes: "')
            + minute_value
            + pynutil.insert('"')
            + delete_extra_space
            + pynutil.insert('seconds: "')
            + (second_value | half)
            + pynutil.insert('"')
        )

        # "<hour> giờ kém <minutes>": both values are remapped through the *_to tables.
        hours_remapped = hours_fst @ hours_to_fst
        minutes_remapped = minutes_fst @ minutes_to_fst
        minutes_to_hour = (
            pynutil.insert('hours: "')
            + hours_remapped
            + pynutil.insert('"')
            + delete_space
            + hour_word
            + delete_space
            + pynutil.delete("kém")
            + delete_extra_space
            + pynutil.insert('minutes: "')
            + minutes_remapped
            + maybe_minute_word
            + pynutil.insert('"')
        )

        # Only the hour-led shapes may be followed by a time zone.
        combined = (hour_field | hour_minute | hour_minute_second) + optional_zone
        combined = combined | minute_second
        combined = combined | minutes_to_hour

        tagged = self.add_tokens(combined)
        self.fst = tagged.optimize()
Пример #11
0
    NEMO_DIGIT,
    GraphFst,
    delete_extra_space,
    delete_space,
)
from nemo_text_processing.inverse_text_normalization.vi.utils import get_abs_path

try:
    import pynini
    from pynini.lib import pynutil

    # Built inside the guard: constructing it at module level after the guard
    # raised NameError on import whenever pynini was missing.
    graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))

    PYNINI_AVAILABLE = True
except (ModuleNotFoundError, ImportError):
    # Placeholder so the module-level name still exists without pynini.
    graph_digit = None

    PYNINI_AVAILABLE = False


def get_quantity(decimal: 'pynini.FstLike',
                 cardinal_up_to_hundred: 'pynini.FstLike') -> 'pynini.FstLike':
    """
    Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral,
    e.g. một triệu -> integer_part: "1" quantity: "triệu"
    e.g. một tỷ rưỡi -> integer_part: "1" fractional_part: "5" quantity: "tỷ"

    Args:
        decimal: decimal FST
        cardinal_up_to_hundred: cardinal FST
    """
    numbers = cardinal_up_to_hundred @ (pynutil.delete(pynini.closure("0")) +
                                        pynini.difference(NEMO_DIGIT, "0") +
Пример #12
0
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from nemo_text_processing.inverse_text_normalization.vi.graph_utils import GraphFst, delete_extra_space, delete_space
from nemo_text_processing.inverse_text_normalization.vi.utils import get_abs_path

try:
    import pynini
    from pynini.lib import pynutil

    # Number tables shared by the functions in this module.
    graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv")).optimize()
    graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize()
    graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")).optimize()
    ties_graph = pynini.string_file(get_abs_path("data/numbers/ties.tsv")).optimize()

    PYNINI_AVAILABLE = True
except (ModuleNotFoundError, ImportError):
    # Placeholders so the module-level names exist when pynini is missing.
    graph_teen = None
    graph_digit = None
    graph_zero = None
    ties_graph = None

    # The import failed, so the flag must report pynini as unavailable
    # (it was previously set to True in this branch as well).
    PYNINI_AVAILABLE = False


def _get_month_graph():
Пример #13
0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from nemo_text_processing.inverse_text_normalization.vi.graph_utils import GraphFst, delete_extra_space, delete_space
from nemo_text_processing.inverse_text_normalization.vi.utils import get_abs_path

try:
    import pynini
    from pynini.lib import pynutil

    # Number tables shared by the functions in this module.
    graph_teen = pynini.string_file(
        get_abs_path("data/numbers/teen.tsv")).optimize()
    graph_digit = pynini.string_file(
        get_abs_path("data/numbers/digit.tsv")).optimize()
    graph_zero = pynini.string_file(
        get_abs_path("data/numbers/zero.tsv")).optimize()
    ties_graph = pynini.string_file(
        get_abs_path("data/numbers/ties.tsv")).optimize()

    PYNINI_AVAILABLE = True
except (ModuleNotFoundError, ImportError):
    # Placeholders so the module-level names exist when pynini is missing.
    graph_teen = None
    graph_digit = None
    graph_zero = None
    ties_graph = None

    # The import failed, so the flag must report pynini as unavailable
    # (it was previously set to True in this branch as well).
    PYNINI_AVAILABLE = False