示例#1
0
 def _get_whitelist_non_deterministic_graph(
         file="data/whitelist_alternatives.tsv"):
     """Build a string-map FST from a two-column whitelist file.

     Each (input, output) row is added twice: first with both fields
     lower-cased, then exactly as it appears in the file.

     Args:
         file: path of the TSV file, resolved through ``get_abs_path``.

     Returns:
         The FST produced by ``pynini.string_map`` over all pairs.
     """
     entries = load_labels(get_abs_path(file))

     pairs = []
     # Lower-cased variants come first, followed by the untouched rows.
     for src, dst in entries:
         pairs.append((src.lower(), dst.lower()))
     for src, dst in entries:
         pairs.append((src, dst))

     return pynini.string_map(pairs)
示例#2
0
    def get_serial_graph(self):
        """
        Build the finite state transducer used to classify serials.
            A serial mixes digits, letters and dashes, e.g.:
            c325-b -> tokens { serial { value: "c three two five b" } }

        When ``self.deterministic`` is false, letters may additionally be
        rewritten through ``data/letter_pronunciation.tsv`` and an optional
        trailing "s" is accepted either unchanged or rewritten to "es".
        """
        non_deterministic = not self.deterministic

        letters = NEMO_ALPHA
        if non_deterministic:
            digits = self.graph
            letters |= pynini.string_map(
                load_labels(get_abs_path("data/letter_pronunciation.tsv")))
        else:
            digits = self.single_digits_graph

        # A separator is either an inserted space or a "-" / "/" turned
        # into a space.
        separator = (insert_space
                     | pynini.cross("-", " ")
                     | pynini.cross("/", " "))

        # The serial must start with letters-then-number or
        # numbers-then-letter; any further letter/number chunks may follow.
        letters_then_number = pynini.closure(letters + separator, 1) + digits
        numbers_then_letter = pynini.closure(digits + separator, 1) + letters
        trailing_chunks = pynini.closure(separator + (letters | digits))
        serial = (letters_then_number | numbers_then_letter) + trailing_chunks

        if non_deterministic:
            optional_s = pynini.closure(
                pynini.accep("s") | pynini.cross("s", "es"), 0, 1)
            serial += optional_s
        return serial
示例#3
0
 def _get_whitelist_graph(input_case, file="data/whitelist.tsv"):
     """Build a string-map FST from a two-column whitelist file.

     Args:
         input_case: when equal to ``"lower_cased"`` the input side of
             every row is lower-cased; any other value keeps it as is.
         file: path of the TSV file, resolved through ``get_abs_path``.

     Returns:
         The FST produced by ``pynini.string_map`` over the rows.
     """
     entries = load_labels(get_abs_path(file))
     lower_inputs = input_case == "lower_cased"
     # Only the input side is ever lower-cased; the output is left alone.
     pairs = [
         (src.lower() if lower_inputs else src, dst)
         for src, dst in entries
     ]
     return pynini.string_map(pairs)
示例#4
0
    def __init__(self, input_case: str):
        """Build the whitelist classifier FST and store it on ``self.fst``.

        Args:
            input_case: when equal to ``"lower_cased"`` the input side of
                every whitelist row is lower-cased before the map is built.
        """
        super().__init__(name="whitelist", kind="classify")

        entries = load_labels(get_abs_path("data/whitelist.tsv"))
        if input_case == "lower_cased":
            pairs = [(src.lower(), dst) for src, dst in entries]
        else:
            pairs = [(src, dst) for src, dst in entries]

        # Wrap the mapped output in a `name: "..."` field.
        opening = pynutil.insert("name: \"")
        mapping = convert_space(pynini.string_map(pairs))
        closing = pynutil.insert("\"")

        tagged = opening + mapping + closing
        self.fst = tagged.optimize()
示例#5
0
文件: date.py 项目: piraka9011/NeMo
    def __init__(self, cardinal: GraphFst, deterministic: bool):
        """Build the date classifier FST and store it on ``self.fst``.

        The final graph is the union of four shapes: month-day-year
        (with a month name or a month number), day-month-year, a numeric
        year-month-day, and a standalone year.

        Args:
            cardinal: graph object whose
                ``graph_hundred_component_at_least_one_none_zero_digit``
                is composed with the day digits.
            deterministic: forwarded to the parent constructor and to
                ``_get_year_graph``.
        """
        super().__init__(name="date", kind="classify", deterministic=deterministic)

        # Month names from names.tsv, plus any input that
        # (TO_LOWER + arbitrary chars) rewrites into a listed name.
        # NOTE(review): presumably this admits a capitalised first letter;
        # confirm against the definition of TO_LOWER.
        month_graph = pynini.string_file(get_abs_path("data/months/names.tsv")).optimize()
        month_graph |= (TO_LOWER + pynini.closure(NEMO_CHAR)) @ month_graph
        # Same treatment for abbreviations, followed by an optional "." that is deleted.
        month_abbr_graph = pynini.string_file(get_abs_path("data/months/abbr.tsv")).optimize()
        month_abbr_graph = (
            month_abbr_graph | (TO_LOWER + pynini.closure(NEMO_CHAR)) @ month_abbr_graph
        ) + pynini.closure(pynutil.delete("."), 0, 1)
        month_graph |= month_abbr_graph

        # To support all-caps names: upper-cased full names are accepted as-is
        # (single-element rows), upper-cased abbreviations map to their second column.
        names_all_caps = [[x[0].upper()] for x in load_labels(get_abs_path("data/months/names.tsv"))]
        abbr_all_caps = [(x.upper(), y) for x, y in load_labels(get_abs_path("data/months/abbr.tsv"))]
        month_graph |= pynini.string_map(names_all_caps) | (
            pynini.string_map(abbr_all_caps) + pynini.closure(pynutil.delete("."), 0, 1)
        )

        month_numbers_graph = pynini.string_file(get_abs_path("data/months/numbers.tsv")).optimize()
        cardinal_graph = cardinal.graph_hundred_component_at_least_one_none_zero_digit

        year_graph = _get_year_graph(deterministic)

        # The standalone-year path carries an extra weight.
        # NOTE(review): presumably so other readings of a bare number win; confirm.
        YEAR_WEIGHT = 0.001
        year_graph_standalone = (
            pynutil.insert("year: \"") + pynutil.add_weight(year_graph, YEAR_WEIGHT) + pynutil.insert("\"")
        )

        # Wrap both month variants in a `month: "..."` field.
        month_graph = pynutil.insert("month: \"") + month_graph + pynutil.insert("\"")
        month_numbers_graph = pynutil.insert("month: \"") + month_numbers_graph + pynutil.insert("\"")

        # Day: a single NEMO_DIGIT, or 1/2/3 followed by a NEMO_DIGIT, composed
        # with the cardinal graph (`@` binds tighter than `+`). The pattern itself
        # does not cap the value at 31, and a leading "0" is not accepted here;
        # the numeric formats below delete it separately.
        day_graph = (
            pynutil.insert("day: \"")
            + ((pynini.union("1", "2", "3") + NEMO_DIGIT) | NEMO_DIGIT) @ cardinal_graph
            + pynutil.insert("\"")
        )
        optional_day_graph = pynini.closure(delete_extra_space + day_graph, 0, 1)

        year_graph = pynutil.insert("year: \"") + year_graph + pynutil.insert("\"")
        optional_graph_year = pynini.closure(delete_extra_space + year_graph, 0, 1,)
        # Month name, optional day, optional "," (deleted), optional year.
        graph_mdy = (
            month_graph
            + optional_day_graph
            + delete_space
            + pynini.closure(pynutil.delete(","), 0, 1)
            + optional_graph_year
        )

        # Numeric month<sep>day<sep>year. Each separator is matched independently,
        # so any of "-", "/", "." is accepted at either position.
        delete_sep = pynutil.delete(pynini.union("-", "/", "."))
        graph_mdy |= (
            month_numbers_graph
            + delete_sep
            + insert_space
            + pynini.closure(pynutil.delete("0"), 0, 1)
            + day_graph
            + delete_sep
            + insert_space
            + year_graph
        )

        # Day, month name, optional year.
        graph_dmy = day_graph + delete_extra_space + month_graph + optional_graph_year
        # Numeric year<sep>month<sep>day, with an optional leading "0" on the day deleted.
        graph_ymd = (
            year_graph
            + delete_sep
            + insert_space
            + month_numbers_graph
            + delete_sep
            + insert_space
            + pynini.closure(pynutil.delete("0"), 0, 1)
            + day_graph
        )

        # Only the mdy and dmy shapes get the " preserve_order: true" marker;
        # ymd and the standalone year are added without it.
        final_graph = (graph_mdy | graph_dmy) + pynutil.insert(" preserve_order: true")
        final_graph |= graph_ymd | year_graph_standalone
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
示例#6
0
文件: roman.py 项目: piraka9011/NeMo
 def _load_roman(file: str):
     """Build a string-map FST from a two-column roman-numeral file.

     Every row is included as written and again with its first column
     upper-cased; the second column is never modified.

     Args:
         file: path of the TSV file, resolved through ``get_abs_path``.

     Returns:
         The FST produced by ``pynini.string_map`` over all pairs.
     """
     entries = load_labels(get_abs_path(file))
     as_written = [(numeral, value) for numeral, value in entries]
     upper_cased = [(numeral.upper(), value) for numeral, value in entries]
     return pynini.string_map(as_written + upper_cased)