def _get_whitelist_non_deterministic_graph(
        file="data/whitelist_alternatives.tsv"):
    """
    Build a string-map graph from the whitelist alternatives file.

    Each row of the file is unpacked as a pair and contributes two entries:
    one with both members lower-cased and one exactly as written.

    Args:
        file: path of the TSV to load; it is resolved through get_abs_path
    Returns:
        the graph built by pynini.string_map over lower-cased entries
        followed by the as-written entries
    """
    entries = load_labels(get_abs_path(file))
    lowered_pairs = [(src.lower(), tgt.lower()) for src, tgt in entries]
    original_pairs = [(src, tgt) for src, tgt in entries]
    return pynini.string_map(lowered_pairs + original_pairs)
def get_serial_graph(self):
    """
    Finite state transducer for classifying serial.
        The serial is a combination of digits, letters and dashes, e.g.:
        c325-b -> tokens { serial { value: "c three two five b" } }

    Numbers are read with self.single_digits_graph in deterministic mode and
    with self.graph otherwise. In non-deterministic mode an optional trailing
    "s" (kept as "s" or rewritten to "es") is also accepted.
    """
    digits = self.single_digits_graph if self.deterministic else self.graph

    # Letters are either plain alphabetic characters or an entry from the
    # letter pronunciation table.
    spelled_letters = pynini.string_map(
        load_labels(get_abs_path("data/letter_pronunciation.tsv")))
    letters = NEMO_ALPHA
    letters |= spelled_letters

    # Between two parts: insert a space, or turn "-" / "/" into a space.
    dash_to_space = pynini.cross("-", " ")
    slash_to_space = pynini.cross("/", " ")
    separator = insert_space | dash_to_space | slash_to_space

    # A serial must start with letter(s) followed by a number, or with
    # number(s) followed by a letter; any mix of both may follow.
    letters_then_number = pynini.closure(letters + separator, 1) + digits
    numbers_then_letter = pynini.closure(digits + separator, 1) + letters
    remainder = pynini.closure(separator + (letters | digits))
    serial_graph = (letters_then_number | numbers_then_letter) + remainder

    if self.deterministic:
        return serial_graph

    optional_plural = pynini.closure(
        pynini.accep("s") | pynini.cross("s", "es"), 0, 1)
    serial_graph += optional_plural
    return serial_graph
def _get_whitelist_graph(input_case, file="data/whitelist.tsv"):
    """
    Build a string-map graph from a whitelist file.

    Args:
        input_case: when equal to "lower_cased" the first member of every
            pair is lower-cased; any other value keeps the entries as written
        file: path of the TSV to load; it is resolved through get_abs_path
    Returns:
        the graph built by pynini.string_map over the (possibly lower-cased)
        pairs
    """
    pairs = [(source, target)
             for source, target in load_labels(get_abs_path(file))]
    if input_case == "lower_cased":
        # Only the input side is lower-cased; the output side is untouched.
        pairs = [(source.lower(), target) for source, target in pairs]
    return pynini.string_map(pairs)
def __init__(self, input_case: str):
    """
    Build the whitelist classifier from data/whitelist.tsv and store the
    optimized transducer on self.fst.

    The mapped output is passed through convert_space and wrapped as
    name: "<output>".

    Args:
        input_case: when equal to "lower_cased" the first member of every
            whitelist pair is lower-cased before the map is built; any other
            value keeps the entries as written
    """
    super().__init__(name="whitelist", kind="classify")

    rows = load_labels(get_abs_path("data/whitelist.tsv"))
    lower_keys = input_case == "lower_cased"
    pairs = [
        (source.lower() if lower_keys else source, target)
        for source, target in rows
    ]

    mapping = pynini.string_map(pairs)
    tagged = (
        pynutil.insert("name: \"")
        + convert_space(mapping)
        + pynutil.insert("\"")
    )
    self.fst = tagged.optimize()
def __init__(self, cardinal: GraphFst, deterministic: bool):
    """
    Build the date classifier and store the optimized transducer on self.fst.

    Accepted layouts, as assembled below:
        * month day year, with the month spelled out or abbreviated
          (day, a comma after the day, and the year are each optional)
        * month day year, all numeric, separated by "-", "/" or "."
        * day month year, with the month spelled out or abbreviated
          (year optional)
        * year month day, all numeric, separated by "-", "/" or "."
        * a standalone year, carrying a small extra weight
    The month-first and day-first layouts get " preserve_order: true"
    appended to their output.

    Args:
        cardinal: cardinal GraphFst; its
            graph_hundred_component_at_least_one_none_zero_digit is composed
            with the day digits
        deterministic: forwarded to the parent constructor and to
            _get_year_graph
    """
    super().__init__(name="date", kind="classify", deterministic=deterministic)

    # Small building blocks reused in several places below.
    lower_then_any = TO_LOWER + pynini.closure(NEMO_CHAR)
    optional_period = pynini.closure(pynutil.delete("."), 0, 1)
    optional_leading_zero = pynini.closure(pynutil.delete("0"), 0, 1)
    optional_comma = pynini.closure(pynutil.delete(","), 0, 1)

    # Month names, directly or after going through TO_LOWER.
    month_names = pynini.string_file(get_abs_path("data/months/names.tsv")).optimize()
    month_names_any = month_names | (lower_then_any @ month_names)

    # Month abbreviations, same treatment, with an optional trailing ".".
    month_abbr = pynini.string_file(get_abs_path("data/months/abbr.tsv")).optimize()
    month_abbr_any = (month_abbr | lower_then_any @ month_abbr) + optional_period

    month_spelled = month_names_any | month_abbr_any

    # to support all caps names
    upper_names = [
        [row[0].upper()]
        for row in load_labels(get_abs_path("data/months/names.tsv"))
    ]
    upper_abbrs = [
        (abbr.upper(), name)
        for abbr, name in load_labels(get_abs_path("data/months/abbr.tsv"))
    ]
    upper_variants = pynini.string_map(upper_names) | (
        pynini.string_map(upper_abbrs) + optional_period
    )
    month_all = month_spelled | upper_variants

    month_numbers = pynini.string_file(get_abs_path("data/months/numbers.tsv")).optimize()
    cardinal_graph = cardinal.graph_hundred_component_at_least_one_none_zero_digit
    year_graph = _get_year_graph(deterministic)

    # The standalone year is built from the bare year graph, before the
    # year field used inside full dates is assembled.
    standalone_year_weight = 0.001
    year_standalone_field = (
        pynutil.insert("year: \"")
        + pynutil.add_weight(year_graph, standalone_year_weight)
        + pynutil.insert("\"")
    )

    # Tagged fields.
    month_field = pynutil.insert("month: \"") + month_all + pynutil.insert("\"")
    month_number_field = pynutil.insert("month: \"") + month_numbers + pynutil.insert("\"")

    # One digit, or two digits whose first is 1, 2 or 3.
    day_digits = (pynini.union("1", "2", "3") + NEMO_DIGIT) | NEMO_DIGIT
    day_field = (
        pynutil.insert("day: \"")
        + day_digits @ cardinal_graph
        + pynutil.insert("\"")
    )
    optional_day = pynini.closure(delete_extra_space + day_field, 0, 1)

    year_field = pynutil.insert("year: \"") + year_graph + pynutil.insert("\"")
    optional_year = pynini.closure(delete_extra_space + year_field, 0, 1)

    # Month-first layouts: spelled month, then the numeric form.
    spelled_mdy = (
        month_field
        + optional_day
        + delete_space
        + optional_comma
        + optional_year
    )
    delete_sep = pynutil.delete(pynini.union("-", "/", "."))
    numeric_mdy = (
        month_number_field
        + delete_sep
        + insert_space
        + optional_leading_zero
        + day_field
        + delete_sep
        + insert_space
        + year_field
    )
    graph_mdy = spelled_mdy | numeric_mdy

    graph_dmy = day_field + delete_extra_space + month_field + optional_year

    graph_ymd = (
        year_field
        + delete_sep
        + insert_space
        + month_number_field
        + delete_sep
        + insert_space
        + optional_leading_zero
        + day_field
    )

    ordered_dates = (graph_mdy | graph_dmy) + pynutil.insert(" preserve_order: true")
    all_dates = ordered_dates | (graph_ymd | year_standalone_field)

    self.fst = self.add_tokens(all_dates).optimize()
def _load_roman(file: str):
    """
    Build a string-map graph from a two-member-per-row label file.

    Every row contributes its pair as written, followed (after all as-written
    pairs) by a copy whose first member is upper-cased.

    Args:
        file: path of the file to load; it is resolved through get_abs_path
    Returns:
        the graph built by pynini.string_map over the combined pairs
    """
    entries = load_labels(get_abs_path(file))
    as_written = [(numeral, value) for numeral, value in entries]
    upper_cased = [(numeral.upper(), value) for numeral, value in entries]
    return pynini.string_map(as_written + upper_cased)