def __init__(self):
    """
    Builds the verbalizer graph for "electronic" tokens.

    Two serialized layouts are accepted:
        username: "..." domain: "..."   ->  the two values joined by "@"
        protocol: "..."                 ->  the quoted value, unchanged
    The result is wrapped with `self.delete_tokens`, optimized and stored in `self.fst`.
    """
    super().__init__(name="electronic", kind="verbalize")

    def quoted_value(field_name: str):
        # Removes the field name, whatever `delete_space` consumes and the two
        # surrounding double quotes; one or more NEMO_NOT_QUOTE symbols are kept.
        return (
            pynutil.delete(field_name)
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\"")
        )

    # The three fields share one layout, so they are all built by the same helper.
    user_name = quoted_value("username:")
    domain = quoted_value("domain:")
    protocol = quoted_value("protocol:")

    # E-mail layout: "@" is inserted between the user name and the domain.
    graph = user_name + delete_space + pynutil.insert("@") + domain
    # URL layout: the protocol value is passed through as is.
    graph |= protocol

    delete_tokens = self.delete_tokens(graph)
    self.fst = delete_tokens.optimize()
def __init__(self, decimal: GraphFst, cardinal: GraphFst):
    """
    Builds the verbalizer graph for "measure" tokens.

    The input is either `cardinal { ... }` or `decimal { ... }` followed by `units: "..."`.
    The output is the (optionally negative) number, one inserted space and the unit.

    Args:
        decimal: graph object whose `numbers` attribute verbalizes the decimal body
        cardinal: graph object whose `numbers` attribute verbalizes the cardinal body
    """
    super().__init__(name="measure", kind="verbalize")

    # `negative: "true"` becomes a leading minus sign; the field is optional.
    sign = pynini.closure(pynini.cross("negative: \"true\"", "-"), 0, 1)

    def braced_number(opening: str, numbers):
        # Removes the `... {` / `}` wrapper around a signed number.
        return (
            pynutil.delete(opening)
            + delete_space
            + sign
            + delete_space
            + numbers
            + delete_space
            + pynutil.delete("}")
        )

    # The unit is one or more characters other than a space, stripped of its field name and quotes.
    unit_value = pynini.closure(NEMO_CHAR - " ", 1)
    unit = (
        pynutil.delete("units:")
        + delete_space
        + pynutil.delete("\"")
        + unit_value
        + pynutil.delete("\"")
        + delete_space
    )

    decimal_number = braced_number("decimal {", decimal.numbers)
    cardinal_number = braced_number("cardinal {", cardinal.numbers)

    measure = (cardinal_number | decimal_number) + delete_space + pynutil.insert(" ") + unit
    self.fst = self.delete_tokens(measure).optimize()
def _get_ties_graph():
    """
    Transducer for 20-99
        e.g hai ba -> 23

    The tens part comes from the module-level `ties_graph`; the word "mươi" may follow
    and is mapped to the empty string. The units digit is either read from the input
    (including the forms "mốt", "tư" and "lăm") or, when absent, filled in as "0".
    """
    # The word for "ten" carries no digit of its own, so it is mapped to nothing.
    tens_word = pynini.cross("mươi", "")
    maybe_tens_word = pynini.closure(delete_space + tens_word, 0, 1)

    # Unit forms that are accepted in addition to `graph_digit`.
    one_variant = pynini.cross("mốt", "1")
    four_variant = pynini.cross("tư", "4")
    five_variant = pynini.cross("lăm", "5")
    spoken_unit = delete_space + (graph_digit | one_variant | four_variant | five_variant)

    # No spoken unit -> round number, so a "0" is inserted.
    units = spoken_unit | pynutil.insert("0")

    return ties_graph + maybe_tens_word + units
def __init__(self):
    """
    Builds the verbalizer graph for "time" tokens.

    Accepted layouts:
        hours: "..."                   ->  the hour value
        hours: "..." minutes: "..."    ->  the two values joined by ":"
    """
    super().__init__(name="time", kind="verbalize")

    closing_quote = pynutil.delete("\"")
    value = pynini.closure(NEMO_NOT_QUOTE, 1)

    hour = pynutil.delete("hours: ") + pynutil.delete("\"") + value + closing_quote
    minutes = pynutil.delete("minutes: ") + pynutil.delete("\"") + value + closing_quote

    # Hour-only layout.
    hour_only = pynutil.delete("hours: \"") + value + closing_quote

    # Hour followed by minutes; the original comment describes this branch as the one
    # "for cases that require permutations for the correct verbalization".
    hour_with_minutes = hour + delete_space + pynutil.insert(":") + minutes + delete_space

    time_graph = hour_only | hour_with_minutes
    self.fst = self.delete_tokens(time_graph).optimize()
def get_quantity(deci):
    """
    Returns an FST that tags a number followed by a magnitude word.

    Two layouts are produced:
        integer_part: "..." quantity: "..."   (cardinal followed by million ... sextillion)
        deci quantity: "..."                  (output of `deci`, "thousand" is allowed as well)

    Args:
        deci: decimal FST placed in front of the quantity in the second layout

    NOTE: `cardinal` is not a parameter; it is resolved from the surrounding scope.
    """
    # Drops leading zeros and requires a non-zero first digit.
    without_leading_zeros = (
        pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT)
    )
    numbers = cardinal.graph_hundred_component_at_least_one_none_zero_digit @ without_leading_zeros

    suffix = pynini.union("million", "billion", "trillion", "quadrillion", "quintillion", "sextillion")

    def quantity_field(words):
        # Wraps the magnitude word in `quantity: "..."`.
        return pynutil.insert("quantity: \"") + words + pynutil.insert("\"")

    integer_field = pynutil.insert("integer_part: \"") + numbers + pynutil.insert("\"")

    cardinal_layout = integer_field + delete_extra_space + quantity_field(suffix)
    decimal_layout = deci + delete_extra_space + quantity_field(suffix | "thousand")

    return cardinal_layout | decimal_layout
def __init__(self, deterministic: bool = True):
    """
    Builds the verbalizer graph for "electronic" tokens using the tsv resources
    under data/numbers and data/electronic.

    Accepted layouts:
        [protocol: "..." ]domain: "..."
        username: "..." domain: "..."    (the word "at " is inserted before the domain)
    The unwrapped graph is kept in `self.graph`; `self.fst` is the token-stripped, optimized graph.

    Args:
        deterministic: forwarded unchanged to the parent constructor
    """
    super().__init__(name="electronic", kind="verbalize", deterministic=deterministic)

    def load(relative_path: str):
        return pynini.string_file(get_abs_path(relative_path))

    # Digit tables are inverted before use; "1" additionally maps to "eins".
    digit_no_zero = pynini.invert(load("data/numbers/digit.tsv")).optimize() | pynini.cross("1", "eins")
    zero = pynini.invert(load("data/numbers/zero.tsv")).optimize()
    any_digit = digit_no_zero | zero

    symbols = load("data/electronic/symbols.tsv").optimize()
    server_common = load("data/electronic/server_name.tsv")
    domain_common = load("data/electronic/domain.tsv")

    # One or more non-space NEMO_NOT_QUOTE symbols with `insert_space` after every one but the last.
    non_space = NEMO_NOT_QUOTE - pynini.accep(" ")
    spaced_characters = pynini.closure(non_space + insert_space) + non_space

    # Context-free rewrites applied on top of the spaced-out characters.
    verbalize_characters = pynini.cdrewrite(symbols | any_digit, "", "", NEMO_SIGMA)
    verbalize_symbols_only = pynini.cdrewrite(symbols, "", "", NEMO_SIGMA)

    user_name = pynutil.delete("username: \"") + spaced_characters + pynutil.delete("\"")
    user_name = user_name @ verbalize_characters

    # Single characters carry a small penalty compared with the domain/server tables.
    domain_unit = pynutil.add_weight(NEMO_NOT_QUOTE, weight=0.0001) | domain_common | server_common
    domain_body = domain_unit + pynini.closure(insert_space + domain_unit)
    domain_body = domain_body @ verbalize_characters
    domain = pynutil.delete("domain: \"") + domain_body + pynutil.delete("\"")

    # `@` binds tighter than `+`: only the spaced characters are composed with the rewrite.
    protocol_body = spaced_characters @ verbalize_symbols_only
    protocol = pynutil.delete("protocol: \"") + protocol_body + pynutil.delete("\"")

    url_layout = pynini.closure(protocol + pynini.accep(" "), 0, 1) + domain
    email_layout = user_name + pynini.accep(" ") + pynutil.insert("at ") + domain
    self.graph = url_layout | email_layout

    self.fst = self.delete_tokens(self.graph + delete_preserve_order).optimize()
def prepare_labels_for_insertion(file_path: str):
    """
    Reads the label file and creates one union insertion graph per key.

    Every record returned by `load_labels` is unpacked into a (key, label) pair.
    Labels that share a key are grouped, and each group is turned into
    `insert_space` followed by the union of FSTs inserting each label.

    Args:
        file_path: path to the label file

    Returns:
        defaultdict mapping each key to the FST that inserts one of its labels
    """
    grouped = defaultdict(list)
    for key, label in load_labels(file_path):
        grouped[key].append(label)

    # Values are replaced in place; the key set does not change while iterating.
    for key, endings in grouped.items():
        inserters = [pynutil.insert(ending) for ending in endings]
        grouped[key] = insert_space + pynini.union(*inserters)

    return grouped
def __init__(self, deterministic: bool = True):
    """
    Builds the classifier graph for "telephone" tokens.

    Output layout:
        [country_code: "..." ]number_part: "..."[ extension: "..."]

    Args:
        deterministic: forwarded unchanged to the parent constructor
    """
    super().__init__(name="telephone", kind="classify", deterministic=deterministic)

    # Inserted between components of the number part.
    component_separator = pynutil.insert(", ")

    # Inverted digit table; "0" may additionally be read as "o".
    spoken_digit = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv"))).optimize()
    spoken_digit = spoken_digit | pynini.cross("0", "o")

    # Country code: optional "+" followed by one to three digits.
    country_code = (
        pynutil.insert("country_code: \"")
        + pynini.closure(pynutil.delete("+"), 0, 1)
        + pynini.closure(spoken_digit + insert_space, 0, 2)
        + spoken_digit
        + pynutil.insert("\"")
    )
    optional_country_code = pynini.closure(
        country_code + pynini.closure(pynutil.delete("-"), 0, 1) + delete_space + insert_space, 0, 1
    )

    # Area code: three digits read one by one, or "800" read as "eight hundred" (favoured by weight).
    area_code_800 = pynutil.add_weight(pynini.cross("800", "eight hundred"), -1.1)
    area_code_digits = pynini.closure(spoken_digit + insert_space, 2, 2) + spoken_digit
    area_code = area_code_digits | area_code_800

    # The area code ends with "-" or is written in parentheses followed by " " or "-".
    dashed_area = area_code + pynutil.delete("-")
    bracketed_area = pynutil.delete("(") + area_code + (pynutil.delete(") ") | pynutil.delete(")-"))
    area_part = (dashed_area | bracketed_area) + component_separator

    # Exactly seven digits/letters, each optionally followed by one "-" or " ".
    optional_separator = pynini.closure(pynini.union("-", " "), 0, 1)
    seven_symbols = ((NEMO_DIGIT + optional_separator) | (NEMO_ALPHA + optional_separator)) ** 7

    # `+` binds tighter than `|`, so the closure body is a union of three alternatives.
    digit_then_gap = (NEMO_DIGIT @ spoken_digit) + (insert_space | pynini.cross("-", ', '))
    letter_then_gap = NEMO_ALPHA + pynini.cross("-", ' ')
    number_words = pynini.closure(digit_then_gap | NEMO_ALPHA | letter_then_gap)
    number_words = pynini.compose(seven_symbols, number_words)

    number_part = pynutil.insert("number_part: \"") + (area_part + number_words) + pynutil.insert("\"")

    # Extension: one to four digits.
    extension = (
        pynutil.insert("extension: \"")
        + pynini.closure(spoken_digit + insert_space, 0, 3)
        + spoken_digit
        + pynutil.insert("\"")
    )
    optional_extension = pynini.closure(insert_space + extension, 0, 1)

    telephone = optional_country_code + number_part + optional_extension
    self.fst = self.add_tokens(telephone).optimize()
def _get_year_graph():
    """
    Transducer for year, e.g. hai không hai mươi -> 2020

    The result is the union of four patterns (see the inline comments); it is
    optimized in place before being returned.
    """
    graph_ties = _get_ties_graph()

    # Two-digit groups starting with a zero: "linh"/"lẻ" + digit (or "tư" -> 4),
    # or `graph_zero` + digit.
    spoken_zero = pynini.cross((pynini.union("linh", "lẻ")), "0")
    four_variant = pynini.cross("tư", "4")
    graph_digits = pynini.union(
        spoken_zero + delete_space + (graph_digit | four_variant),
        graph_zero + delete_space + graph_digit,
    )
    graph_digits.optimize()

    # digit + "trăm" + two-digit group
    graph_hundreds = (
        graph_digit
        + delete_space
        + pynutil.delete("trăm")
        + delete_space
        + (graph_teen | graph_ties | graph_digits)
    )

    # digit + "nghìn"/"ngàn" + optional hundreds digit ("0" inserted when absent) + two-digit group
    hundreds_digit = ((graph_digit | graph_zero) + delete_space + pynutil.delete("trăm")) | pynutil.insert("0")
    graph_thousands = (
        graph_digit
        + delete_space
        + pynutil.delete(pynini.union("nghìn", "ngàn"))
        + delete_space
        + hundreds_digit
        + delete_space
        + (graph_teen | graph_ties | graph_digits)
    )

    # 20 19, 40 12, 2012, 2 0 0 5, 2 0 17, 938 - assuming no limit on the year
    digit_by_digit = (
        graph_digit
        + delete_space
        + (graph_digit | graph_zero)
        + delete_space
        + (graph_teen | graph_ties | graph_digits)
    )
    # A single digit followed directly by a two-digit group; the hundreds place is filled with "0".
    digit_then_group = graph_digit + pynutil.insert("0") + delete_space + (graph_ties | graph_digits | graph_teen)

    year_graph = digit_by_digit | graph_thousands | graph_hundreds | digit_then_group
    year_graph.optimize()
    return year_graph
def __init__(
    self,
    input_case: str,
    cache_dir: str = None,
    overwrite_cache: bool = False,
    deterministic: bool = True,
    whitelist: str = None,
):
    """
    Builds (or restores from a .far cache) the "tokenize_and_classify" graph made of
    whitelist, word and punctuation grammars.

    Args:
        input_case: forwarded to WhiteListFst and used in the cache file name
        cache_dir: directory for the .far cache; None or the string "None" disables caching
        overwrite_cache: when True the grammars are rebuilt even if a cache file exists
        deterministic: forwarded to the grammars and used in the cache file name
        whitelist: path whose base name becomes part of the cache file name
            NOTE(review): within this method it is not passed on to WhiteListFst - confirm intended.
    """
    super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)

    far_file = None
    caching_enabled = cache_dir is not None and cache_dir != "None"
    if caching_enabled:
        os.makedirs(cache_dir, exist_ok=True)
        whitelist_file = os.path.basename(whitelist) if whitelist else ""
        far_file = os.path.join(
            cache_dir, f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
        )

    # Cache hit: load the stored graph and stop here.
    if not overwrite_cache and far_file and os.path.exists(far_file):
        self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
        logging.info(f'ClassifyFst.fst was restored from {far_file}.')
        return

    logging.info("Creating ClassifyFst grammars.")

    word_graph = WordFst(deterministic=deterministic).fst
    whitelist_graph = WhiteListFst(input_case=input_case, deterministic=deterministic).fst
    punct_graph = PunctuationFst(deterministic=deterministic).fst

    # Whitelist entries are strongly preferred over the generic word fallback.
    classify = pynutil.add_weight(whitelist_graph, 1) | pynutil.add_weight(word_graph, 100)

    open_token = pynutil.insert("tokens { ")
    close_token = pynutil.insert(" }")
    punct = open_token + pynutil.add_weight(punct_graph, weight=1.1) + close_token
    token = open_token + classify + close_token

    # Any number of punctuation tokens may surround one classified token.
    leading_punct = pynini.closure(punct + pynutil.insert(" "))
    trailing_punct = pynini.closure(pynutil.insert(" ") + punct)
    token_plus_punct = leading_punct + token + trailing_punct

    sentence = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
    sentence = delete_space + sentence + delete_space
    self.fst = sentence.optimize()

    if far_file:
        generator_main(far_file, {"tokenize_and_classify": self.fst})
        logging.info(f"ClassifyFst grammars are saved to {far_file}.")
def __init__(self, ordinal: GraphFst):
    """
    Builds the classifier graph for "date" tokens.

    Three layouts are produced, each followed by ` preserve_order: true`:
        month [day] [year]
        "the" day "of" month [year]
        year (or the output of `_get_range_graph`)

    Args:
        ordinal: graph object whose `graph` attribute is used for the day
    """
    super().__init__(name="date", kind="classify")

    day_numbers = ordinal.graph
    # weekday, day, month, year, style(depr), text(depr), short_year(depr), era
    year_weight = 0.001
    # A bare year carries a small penalty ...
    year_graph = pynutil.add_weight(_get_year_graph(), year_weight)

    month_graph = pynutil.insert("month: \"") + _get_month_graph() + pynutil.insert("\"")

    day_graph = pynutil.insert("day: \"") + pynutil.add_weight(day_numbers, -0.7) + pynutil.insert("\"")
    optional_day_graph = pynini.closure(delete_extra_space + day_graph, 0, 1)

    # ... which is cancelled again when the year follows a month.
    year_after_month = (
        delete_extra_space
        + pynutil.insert("year: \"")
        + pynutil.add_weight(year_graph, -year_weight)
        + pynutil.insert("\"")
    )
    optional_graph_year = pynini.closure(year_after_month, 0, 1)

    graph_mdy = month_graph + optional_day_graph + optional_graph_year
    graph_dmy = (
        pynutil.delete("the")
        + delete_space
        + day_graph
        + delete_space
        + pynutil.delete("of")
        + delete_extra_space
        + month_graph
        + optional_graph_year
    )
    graph_year = pynutil.insert("year: \"") + (year_graph | _get_range_graph()) + pynutil.insert("\"")

    date_graph = (graph_mdy | graph_dmy | graph_year) + pynutil.insert(" preserve_order: true")
    self.fst = self.add_tokens(date_graph).optimize()
def get_four_digit_year_graph(deterministic: bool = True):
    """
    Returns a four digit transducer which is combination of ties/teen or digits
    (using hundred instead of thousand format), e.g.
    1219 -> twelve nineteen
    3900 -> thirty nine hundred

    Inputs ending in "s" are handled as well: the trailing "0s" / "s" is removed and
    the last word is pluralized by a rewrite at the end of the string.

    Args:
        deterministic: passed to `get_ties_graph`; also selects whether the "thousand"
            readings take priority (True) or are simply added as alternatives (False)
    """
    graph_ties = get_ties_graph(deterministic)

    # Building blocks shared by the plain and the plural readings.
    ties_ties = graph_ties + insert_space + graph_ties
    hundreds = (graph_teen | graph_ties) + insert_space + pynini.cross("00", "hundred")

    # Plural readings: strip the written plural marker first ...
    teen_decade = graph_teen + insert_space + (ties_graph | pynini.cross("1", "ten"))
    plural_input = ((ties_ties) | (teen_decade)) + pynutil.delete("0s")
    plural_input |= hundreds + pynutil.delete("s")
    # ... then pluralize the final word ("y" -> "ies", otherwise append "s").
    pluralize_last_word = pynini.cdrewrite(
        pynini.cross("y", "ies") | pynutil.insert("s"), "", "[EOS]", NEMO_SIGMA
    )
    graph_with_s = plural_input @ pluralize_last_word

    # "thousand" readings; `+` binds tighter than `|` inside the last factor.
    thousand_graph = (
        graph_digit
        + insert_space
        + pynini.cross("00", "thousand")
        + (pynutil.delete("0") | insert_space + graph_digit)
    )
    thousand_graph |= (
        graph_digit
        + insert_space
        + pynini.cross("000", "thousand")
        + pynini.closure(pynutil.delete(" "), 0, 1)
        + pynini.accep("s")
    )

    year_graph = ties_ties | hundreds | graph_with_s

    if not deterministic:
        year_graph |= thousand_graph
    else:
        year_graph = plurals._priority_union(thousand_graph, year_graph, NEMO_SIGMA)

    return year_graph.optimize()
def __init__(self):
    """
    Builds the classifier graph for spoken "electronic" tokens.

    Output layouts:
        username: "..." domain: "..."    (e-mail; the word "at" is removed, "dot" becomes ".")
        protocol: "..."                  (url starting with an optional http(s) prefix and "www")
    """
    super().__init__(name="electronic", kind="classify")

    def load(relative_path: str):
        return pynini.string_file(get_abs_path(relative_path))

    # Local single-space deleter used between the spelled-out pieces.
    drop_space = pynutil.delete(" ")

    alpha_num = NEMO_ALPHA | load("data/numbers/digit.tsv") | load("data/numbers/zero.tsv")
    symbols = load("data/electronic/symbols.tsv").invert()
    username_char = alpha_num | symbols
    dot = pynini.cross("dot", ".")

    # E-mail -------------------------------------------------------------
    username = (
        pynutil.insert("username: \"")
        + alpha_num
        + pynini.closure(drop_space + username_char)
        + pynutil.insert("\"")
    )
    spelled_out = pynini.closure(alpha_num + drop_space) + alpha_num
    server = spelled_out | load("data/electronic/server_name.tsv")
    domain = spelled_out | load("data/electronic/domain.tsv")
    domain_graph = (
        pynutil.insert("domain: \"") + server + drop_space + dot + drop_space + domain + pynutil.insert("\"")
    )
    graph = username + drop_space + pynutil.delete("at") + insert_space + drop_space + domain_graph

    # Url ----------------------------------------------------------------
    www = pynini.cross(pynini.union("w w w", "www"), "www")
    scheme = (pynini.cross("h t t p", "http") | pynini.cross("h t t p s", "https")) + pynini.cross(
        " colon slash slash ", "://"
    )
    # .com,
    spelled_username_chars = pynini.closure(username_char + drop_space) + username_char
    ending = drop_space + symbols + drop_space + (domain | spelled_username_chars)

    url = (
        pynini.closure(scheme, 0, 1)
        + www
        + drop_space
        + dot
        + pynini.closure(drop_space + username_char, 1)
        + pynini.closure(ending, 1)
    )
    graph |= pynutil.insert("protocol: \"") + url + pynutil.insert("\"")

    self.fst = self.add_tokens(graph).optimize()
def get_quantity(decimal: "pynini.FstLike", cardinal_up_to_hundred: "pynini.FstLike") -> "pynini.FstLike":
    """
    Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral,
    e.g. một triệu -> integer_part: "1" quantity: "triệu"
    e.g. một tỷ rưỡi -> integer_part: "1" fractional_part: "5" quantity: "tỷ"

    Args:
        decimal: decimal FST
        cardinal_up_to_hundred: cardinal FST
    """
    # Cardinal without leading zeros and with a non-zero first digit.
    strip_leading_zeros = (
        pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT)
    )
    numbers = cardinal_up_to_hundred @ strip_leading_zeros

    suffix = pynini.union("triệu", "tỉ", "tỷ", "vạn")

    four_variant = pynini.cross("tư", "4")
    one_variant = pynini.cross("mốt", "1")
    half = pynini.cross("rưỡi", "5")

    # Trailing digit: every input of `graph_digit` except "năm", plus the special forms.
    excluded_input = pynini.project(pynini.cross("năm", "5"), "input")
    regular_digit = (pynini.project(graph_digit, "input") - excluded_input.arcsort()) @ graph_digit
    last_digit = pynini.union(regular_digit, one_variant, four_variant, half)

    fraction_field = (
        delete_extra_space
        + pynutil.insert('fractional_part: "')
        + (last_digit | half | one_variant | four_variant)
        + pynutil.insert('"')
    )
    optional_fraction_graph = pynini.closure(fraction_field, 0, 1)

    def quantity_field(words):
        # Wraps the magnitude word in `quantity: "..."`.
        return pynutil.insert('quantity: "') + words + pynutil.insert('"')

    cardinal_layout = (
        pynutil.insert('integer_part: "')
        + numbers
        + pynutil.insert('"')
        + delete_extra_space
        + quantity_field(suffix)
        + optional_fraction_graph
    )
    # After a decimal the words for "thousand" are accepted as well.
    decimal_layout = decimal + delete_extra_space + quantity_field(suffix | "ngàn" | "nghìn")

    return cardinal_layout | decimal_layout
def __init__(self, deterministic: bool = True):
    """
    Builds the verbalizer graph for "date" tokens.

    Layouts:
        day month [year]    ->  day " de " month [year]
        month day [year]    (in deterministic mode only together with ` preserve_order: true`)
    The unwrapped graph is kept in `self.graph`.

    Args:
        deterministic: if True "uno" as the whole day is always rewritten to "primero";
            otherwise both readings are kept
    """
    super().__init__(name="date", kind="verbalize", deterministic=deterministic)

    def quoted_field(prefix: str, body):
        return pynutil.delete(prefix) + body + pynutil.delete("\"")

    value = pynini.closure(NEMO_NOT_QUOTE, 1)

    day = strip_cardinal_apocope(quoted_field("day: \"", value))

    # Primero for first day is traditional, but will vary depending on region
    primero = pynini.cdrewrite(pynini.cross("uno", "primero"), "[BOS]", "[EOS]", NEMO_SIGMA)
    if deterministic:
        day = day @ primero
    else:
        day = pynini.union(day, day @ primero)

    month = quoted_field("month: \"", value)

    # Year that already starts with one of `articles`; slightly preferred.
    year = quoted_field("year: \"", articles + NEMO_SPACE + value)
    year = pynutil.add_weight(year, -0.001)
    # Insert preposition if wasn't originally with the year. This would mean a space was present
    year |= quoted_field("year: \"", pynutil.insert("de ") + value)

    # day month year
    graph_dmy = day + pynini.cross(NEMO_SPACE, " de ") + month + pynini.closure(pynini.accep(" ") + year, 0, 1)

    graph_mdy = month + NEMO_SPACE + day + pynini.closure(NEMO_SPACE + year, 0, 1)
    if deterministic:
        # Only accepts this if was explicitly passed
        graph_mdy += pynutil.delete(" preserve_order: true")

    self.graph = graph_dmy | graph_mdy
    self.fst = self.delete_tokens(self.graph + delete_preserve_order).optimize()
def __init__(self, cardinal: GraphFst, deterministic: bool):
    """
    Builds the classifier graph for "decimal" tokens.

    Output layout:
        [negative: "true" ][integer_part: "..." ]fractional_part: "..."[ quantity: "..."]
    `self.graph` holds the fractional-part graph and `self.final_graph_wo_negative`
    the unsigned decimal (with or without quantity).

    Args:
        cardinal: graph object providing `graph` and
            `graph_hundred_component_at_least_one_none_zero_digit`
        deterministic: if False the fractional part may also be read by `cardinal.graph`
    """
    super().__init__(name="decimal", kind="classify", deterministic=deterministic)

    cardinal_graph = cardinal.graph
    hundred_component = cardinal.graph_hundred_component_at_least_one_none_zero_digit

    # Word -> digit tables; "o" is accepted for "0" only inside multi-digit sequences.
    word_to_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    word_to_digit |= pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    digit_or_o = word_to_digit | pynini.cross("o", "0")
    multi_digit = (word_to_digit | pynini.cross("o", "0")) + pynini.closure(delete_space + digit_or_o, 1)
    word_to_digits = pynini.cross("zero", "0") | word_to_digit | multi_digit

    # Inverted so that written digits are read out.
    self.graph = pynini.invert(word_to_digits).optimize()
    if not deterministic:
        self.graph = self.graph | cardinal_graph

    point = pynutil.delete(".")
    optional_graph_negative = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)

    graph_fractional = pynutil.insert("fractional_part: \"") + self.graph + pynutil.insert("\"")
    graph_integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"")

    # The integer part is optional.
    unsigned_decimal = (
        pynini.closure(graph_integer + pynutil.insert(" "), 0, 1) + point + pynutil.insert(" ") + graph_fractional
    )
    self.final_graph_wo_negative = unsigned_decimal | get_quantity(unsigned_decimal, hundred_component)

    signed_decimal = optional_graph_negative + self.final_graph_wo_negative
    self.fst = self.add_tokens(signed_decimal).optimize()
def __init__(
    self,
    itn_cardinal_tagger: GraphFst,
    tn_date_tagger: GraphFst,
    tn_date_verbalizer: GraphFst,
    deterministic: bool = True,
):
    """
    Builds the classifier graph for "date" tokens by inverting the text-normalization
    date verbalizer and converting its fields back to written form.

    Output layout:
        name: "..."

    Args:
        itn_cardinal_tagger: provides `graph_no_exception` for day and month numbers
        tn_date_tagger: provides `month_abbr` and `year`
        tn_date_verbalizer: provides `graph`, which is inverted and used as the tagger
        deterministic: forwarded unchanged to the parent constructor

    NOTE(review): `.invert()` is called directly on `tn_date_verbalizer.graph` and
    `tn_date_tagger.month_abbr`; if it works in place those shared objects are
    modified - confirm this is intended.
    """
    super().__init__(name="date", kind="classify", deterministic=deterministic)

    def quoted(prefix: str, body):
        return pynutil.delete(prefix) + body + pynutil.delete("\"")

    value = pynini.closure(NEMO_NOT_QUOTE, 1)

    # Two digits pass through unchanged; a single digit gets a leading "0".
    pad_to_two_digits = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT)
    # Spaces may be kept or (at a small cost) dropped.
    optional_delete_space = pynini.closure(NEMO_SIGMA | pynutil.delete(" ", weight=0.0001))

    tagger = tn_date_verbalizer.graph.invert().optimize()

    cardinal_numbers = itn_cardinal_tagger.graph_no_exception
    delete_day_marker = quoted("day: \"", value) @ cardinal_numbers
    month_as_number = quoted("month: \"", cardinal_numbers)
    month_as_string = quoted("month: \"", tn_date_tagger.month_abbr.invert())

    convert_year = (tn_date_tagger.year @ optional_delete_space).invert().optimize()
    delete_year_marker = quoted("year: \"", value) @ convert_year

    # day. month as string (year)
    verbalizer = (
        pynini.closure(delete_day_marker + pynutil.insert(".") + pynini.accep(" "), 0, 1)
        + month_as_string
        + pynini.closure(pynini.accep(" ") + delete_year_marker, 0, 1)
    )

    # day. month as number (year)
    # `@` binds tighter than `+`, so the padding is composed with day and month only.
    padded_day = delete_day_marker @ pad_to_two_digits
    padded_month = month_as_number @ pad_to_two_digits
    verbalizer |= (
        padded_day
        + pynutil.insert(".")
        + pynutil.delete(" ")
        + padded_month
        + pynutil.insert(".")
        + pynini.closure(pynutil.delete(" ") + delete_year_marker, 0, 1)
    )

    # year
    verbalizer |= delete_year_marker

    final_graph = tagger @ verbalizer
    named = pynutil.insert("name: \"") + convert_space(final_graph) + pynutil.insert("\"")
    self.fst = named.optimize()
def __init__(self):
    """
    Builds the verbalizer graph for "fraction" tokens.

    Input layout:
        [negative: "true" ][integer_part: "..." ]numerator: "..." denominator: "..."
    The numerator and denominator are joined by "/"; `insert_space` follows an integer
    part and a negative flag becomes a leading "-". The unsigned graph is kept in
    `self.numbers`.
    """
    super().__init__(name="fraction", kind="verbalize")

    def quoted_value(prefix: str):
        return pynutil.delete(prefix) + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")

    optional_sign = pynini.closure(pynini.cross("negative: \"true\"", "-") + delete_space, 0, 1)

    integer = quoted_value("integer_part: \"") + insert_space
    numerator = quoted_value("numerator: \"")
    # The slash is emitted right before the denominator value.
    denominator = pynutil.insert('/') + quoted_value("denominator: \"")

    optional_integer = pynini.closure(integer + delete_space, 0, 1)
    fraction = (optional_integer + numerator + delete_space + denominator).optimize()
    self.numbers = fraction

    self.fst = self.delete_tokens(optional_sign + fraction).optimize()
def singular_to_plural():
    """
    Returns an optimized graph that appends a plural ending to a word.

    Entries of the module-level `suppletive` graph take priority; otherwise the
    ending is chosen by the word's final letters (see the rule table below).
    """

    def ending_rule(endings, addition):
        # Any prefix + one of `endings`, followed by the inserted plural ending.
        return NEMO_SIGMA + pynini.union(*endings) + addition

    # Plural ending n/en: masculine nouns ending in e, ent, and, ant, ist, or
    add_n = ending_rule(["e"], pynutil.insert("n"))
    add_en = ending_rule(
        ["ent", "and", "ant", "ist", "or", "ion", "ik", "heit", "keit", "schaft", "tät", "ung"],
        pynutil.insert("en"),
    )
    # Words ending in "in" get either "e" or "nen".
    add_e_or_nen = ending_rule(["in"], pynutil.insert("e") | pynutil.insert("nen"))
    # Endings "ma", "um", "us" get "en" appended.
    add_en_foreign = ending_rule(["ma", "um", "us"], pynutil.insert("en"))
    # Masculine nouns ending in eur, ich, ier, ig, ling, ör
    add_e = ending_rule(["eur", "ich", "ier", "ig", "ling", "ör"], pynutil.insert("e"))
    add_s = ending_rule(["a", "i", "o", "u", "y"], pynutil.insert("s"))

    regular = pynini.union(add_n, add_en, add_e_or_nen, add_en_foreign, add_e, add_s)
    return plurals._priority_union(suppletive, regular, NEMO_SIGMA).optimize()
def get_hundreds_graph(deterministic: bool = True):
    """
    Returns a transducer for four digit numbers read in pairs or with "hundred"/"thousand".

    The result is the (unoptimized) union of four readings:
        ties + ties
        teen + "00" -> "hundred"
        teen + decade written with a trailing "0s", last word pluralized
        digit + "00" -> "thousand" + last digit (a final "0" is dropped), slightly favoured

    Args:
        deterministic: passed to `get_ties_graph`
    """
    graph_ties = get_ties_graph(deterministic)

    ties_pair = graph_ties + insert_space + graph_ties
    teen_hundred = graph_teen + insert_space + pynini.cross("00", "hundred")

    # Plural decade: drop the written "0s", then pluralize the final word
    # ("y" -> "ies", otherwise append "s").
    pluralize_last_word = pynini.cdrewrite(
        pynini.cross("y", "ies") | pynutil.insert("s"), "", "[EOS]", NEMO_SIGMA
    )
    decade_input = graph_teen + insert_space + (ties_graph | pynini.cross("1", "ten")) + pynutil.delete("0s")
    plural_decade = decade_input @ pluralize_last_word

    # `+` binds tighter than `|` inside the last factor.
    thousand = pynutil.add_weight(
        graph_digit
        + insert_space
        + pynini.cross("00", "thousand")
        + (pynutil.delete("0") | insert_space + graph_digit),
        weight=-0.001,
    )

    return ties_pair | teen_hundred | plural_decade | thousand
def __init__(self, decimal: GraphFst, deterministic: bool = True):
    """
    Builds the verbalizer graph for "money" tokens.

    Layouts:
        integer currency_maj
        integer currency_maj fractional_part currency_min   (followed by `delete_preserve_order`)
        decimal currency_maj
        fractional_part currency_min                        (followed by `delete_preserve_order`)

    Args:
        decimal: graph object providing `integer` and `numbers`
        deterministic: if False "and " may be inserted before the minor amount, and the
            integer layout may also be followed by `delete_preserve_order`
    """
    super().__init__(name="money", kind="verbalize", deterministic=deterministic)

    def quoted_value(prefix: str):
        return pynutil.delete(prefix) + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")

    keep_space = pynini.accep(" ")
    major_unit = quoted_value("currency_maj: \"")
    minor_unit = quoted_value("currency_min: \"")
    fractional_part = quoted_value("fractional_part: \"")
    integer_part = decimal.integer

    # *** currency_maj
    graph_integer = integer_part + keep_space + major_unit

    # *** currency_maj + (***) | ((and) *** current_min)
    minor_amount = fractional_part + delete_extra_space + minor_unit
    if deterministic:
        fractional = minor_amount
    else:
        fractional = minor_amount | (pynutil.insert("and ") + minor_amount)
    graph_integer_with_minor = (
        integer_part + keep_space + major_unit + keep_space + fractional + delete_preserve_order
    )

    # *** point *** currency_maj
    graph_decimal = decimal.numbers + keep_space + major_unit

    # *** current_min
    graph_minor = minor_amount + delete_preserve_order

    money = graph_integer | graph_integer_with_minor | graph_decimal | graph_minor
    if not deterministic:
        money |= graph_integer + delete_preserve_order

    self.fst = self.delete_tokens(money).optimize()
def __init__(self, decimal: GraphFst, deterministic: bool = True):
    """
    Builds the verbalizer graph for "money" tokens.

    Layouts:
        integer_part currency_maj
        integer_part currency_maj fractional_part [currency_min]   ("und " may be inserted
            before a fractional part that is followed by the minor unit)
        decimal currency_maj
        fractional_part currency_min

    Args:
        decimal: graph object whose `fst` verbalizes the decimal amount
        deterministic: forwarded unchanged to the parent constructor
    """
    super().__init__(name="money", kind="verbalize", deterministic=deterministic)

    def quoted_value(prefix: str):
        return pynutil.delete(prefix) + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")

    keep_space = pynini.accep(" ")
    major_unit = quoted_value("currency_maj: \"")
    minor_unit = quoted_value("currency_min: \"")
    fractional_part = quoted_value("fractional_part: \"")
    integer_part = quoted_value("integer_part: \"")
    optional_add_and = pynini.closure(pynutil.insert("und "), 0, 1)

    # *** currency_maj
    graph_integer = integer_part + keep_space + major_unit

    # *** currency_maj + (***) | ((und) *** current_min)
    minor_amount = fractional_part + keep_space + minor_unit
    graph_integer_with_minor = (
        integer_part
        + keep_space
        + major_unit
        + keep_space
        + (fractional_part | (optional_add_and + minor_amount))
        + delete_preserve_order
    )

    # *** komma *** currency_maj
    graph_decimal = decimal.fst + keep_space + major_unit

    # *** current_min
    graph_minor = minor_amount + delete_preserve_order

    money = graph_integer | graph_integer_with_minor | graph_decimal | graph_minor
    self.fst = self.delete_tokens(money).optimize()
def prepare_labels_for_insertion(file_path: str):
    """
    Read the file and creates a union insertion graph

    Args:
        file_path: path to a file (3 columns: a label type e.g.
            "@@decimal_delimiter@@", a label e.g. "целого", and a weight e.g. "0.1").

    Returns dictionary mapping from label type to an fst that inserts the labels with the specified weights.
    """
    by_type = defaultdict(list)
    for label_type, label, weight in load_labels(file_path):
        by_type[label_type].append((label, weight))

    # Values are replaced in place; the key set does not change while iterating.
    for label_type, weighted_labels in by_type.items():
        alternatives = [pynutil.add_weight(pynutil.insert(label), weight) for label, weight in weighted_labels]
        by_type[label_type] = (insert_space + pynini.union(*alternatives)).optimize()

    return by_type
def __init__(self, decimal: GraphFst, cardinal: GraphFst, deterministic: bool = True):
    """
    Builds the verbalizer graph for "measure" tokens.

    The input is either `cardinal { ... }` or `decimal { ... }` followed by `units: "..."`.
    The output is the number, one inserted space and the unit. The cardinal branch is
    also exposed as `self.graph_cardinal`.

    Args:
        decimal: graph object whose `numbers` attribute verbalizes the decimal body
        cardinal: graph object providing `numbers` and `optional_sign`
        deterministic: forwarded unchanged to the parent constructor
    """
    super().__init__(name="measure", kind="verbalize", deterministic=deterministic)

    sign = cardinal.optional_sign

    def braced_number(opening: str, numbers):
        # Removes the `... {` / `}` wrapper around a signed number.
        return (
            pynutil.delete(opening)
            + delete_space
            + sign
            + delete_space
            + numbers
            + delete_space
            + pynutil.delete("}")
        )

    # The separating space is inserted right in front of the unit text.
    unit_text = pynutil.insert(" ") + pynini.closure(NEMO_CHAR - " ", 1)
    unit = pynutil.delete("units: \"") + unit_text + pynutil.delete("\"") + delete_space

    graph_decimal = braced_number("decimal {", decimal.numbers)
    self.graph_cardinal = braced_number("cardinal {", cardinal.numbers)

    measure = (self.graph_cardinal | graph_decimal) + delete_space + unit
    self.fst = self.delete_tokens(measure).optimize()
def __init__(self, input_case: str):
    """
    Builds the "tokenize_and_classify" graph that combines all semiotic class grammars.

    Every grammar is added to one union with its own weight; the generic word grammar
    gets weight 100, the others a weight between 1.01 and 1.1. Punctuation tokens may
    surround each classified token.

    Args:
        input_case: forwarded to WhiteListFst
    """
    super().__init__(name="tokenize_and_classify", kind="classify")

    # Grammars are instantiated in dependency order: several need the cardinal/decimal grammars.
    cardinal = CardinalFst()
    ordinal = OrdinalFst(cardinal=cardinal)
    decimal = DecimalFst(cardinal=cardinal)

    cardinal_graph = cardinal.fst
    ordinal_graph = ordinal.fst
    decimal_graph = decimal.fst
    measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal).fst
    date_graph = DateFst(cardinal=cardinal).fst
    word_graph = WordFst().fst
    time_graph = TimeFst(cardinal=cardinal).fst
    telephone_graph = TelephoneFst().fst
    electronic_graph = ElectronicFst().fst
    money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst
    whitelist_graph = WhiteListFst(input_case=input_case).fst
    punct_graph = PunctuationFst().fst

    weighted_grammars = [
        (whitelist_graph, 1.01),
        (time_graph, 1.1),
        (date_graph, 1.09),
        (decimal_graph, 1.1),
        (measure_graph, 1.1),
        (cardinal_graph, 1.1),
        (ordinal_graph, 1.1),
        (money_graph, 1.1),
        (telephone_graph, 1.1),
        (electronic_graph, 1.1),
        (word_graph, 100),
    ]
    classify = pynini.union(
        *[pynutil.add_weight(grammar, weight) for grammar, weight in weighted_grammars]
    ).optimize()

    open_token = pynutil.insert("tokens { ")
    close_token = pynutil.insert(" }")
    punct = open_token + pynutil.add_weight(punct_graph, weight=1.1) + close_token
    token = open_token + classify + close_token

    leading_punct = pynini.closure(punct + pynutil.insert(" "))
    trailing_punct = pynini.closure(pynutil.insert(" ") + punct)
    token_plus_punct = leading_punct + token + trailing_punct

    sentence = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
    sentence = delete_space + sentence + delete_space
    self.fst = sentence.optimize()
def __init__(self, deterministic: bool = True):
    """
    Builds the verbalizer graph for "ordinal" tokens.

    Input layout:
        integer: "..." morphosyntactic_features: "gender_masc|gender_fem|apocope[/plural]"
    The features field selects how the integer text is adjusted. The unwrapped graph is
    kept in `self.graph`.

    Args:
        deterministic: if False the plural variants (marked by "/plural") are accepted too
    """
    super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)

    word_boundary = NEMO_SPACE | pynini.accep("[EOS]")
    integer = pynutil.delete("integer: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")

    # Masculine gender is left as is
    masculine = integer + pynutil.delete(" morphosyntactic_features: \"gender_masc")

    # Shift gender: word-final "o" becomes "a"
    feminine_ending = integer @ pynini.cdrewrite(pynini.cross("o", "a"), "", word_boundary, NEMO_SIGMA)
    feminine = shift_number_gender(feminine_ending) + pynutil.delete(" morphosyntactic_features: \"gender_fem")

    # Apocope just changes tercero and primero. May occur if someone wrote 11.er (uncommon)
    apocope_forms = (
        pynini.cross("tercero", "tercer")
        | pynini.cross("primero", "primer")
        | pynini.cross("undécimo", "decimoprimer")
    )  # In case someone wrote 11.er with deterministic
    apocope = (integer @ pynini.cdrewrite(apocope_forms, "", "", NEMO_SIGMA)) + pynutil.delete(
        " morphosyntactic_features: \"apocope"
    )

    ordinal = apocope | masculine | feminine

    if not deterministic:
        # Plural graph: append "s" after a word-final "o" or "a"
        pluralize = pynini.cdrewrite(pynutil.insert("s"), pynini.union("o", "a"), word_boundary, NEMO_SIGMA)
        ordinal |= (ordinal @ pluralize) + pynutil.delete("/plural")

    # Closing quote of the features field.
    self.graph = ordinal + pynutil.delete("\"")

    self.fst = self.delete_tokens(self.graph).optimize()
def __init__(self):
    """
    Builds the verbalizer graph that spells out "electronic" tokens.

    Input layout:
        username: "..." domain: "..."
    The user name is emitted symbol by symbol, then "at ", then the domain with
    "." read as "dot ". Entries of the server/domain tsv tables are slightly
    preferred (weight 1.09) over the symbol-by-symbol reading (weight 1.1).
    """
    super().__init__(name="electronic", kind="verbalize")

    graph_digit = pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv"))).optimize()

    # Same field layout as `domain` below: field name, `delete_space`, opening quote.
    user_name = (
        pynutil.delete("username:")
        + delete_space
        + pynutil.delete("\"")
        + (
            pynini.closure(
                pynutil.add_weight(graph_digit + insert_space, 1.09)
                | pynutil.add_weight(pynini.closure(pynini.cross(".", "dot ")), 1.09)
                | pynutil.add_weight(NEMO_NOT_QUOTE + insert_space, 1.1)
            )
        )
        + pynutil.delete("\"")
    )

    # Fallback readings: every symbol followed by `insert_space`, "." read as "dot ".
    domain_default = (
        pynini.closure(NEMO_NOT_QUOTE + insert_space)
        + pynini.cross(".", "dot ")
        + NEMO_NOT_QUOTE
        + pynini.closure(insert_space + NEMO_NOT_QUOTE)
    )
    server_default = pynini.closure(NEMO_NOT_QUOTE + insert_space)

    # Table-based readings for known server names and domains.
    server_common = pynini.string_file(get_abs_path("data/electronic/server_name.tsv")) + insert_space
    domain_common = pynini.cross(".", "dot ") + pynini.string_file(get_abs_path("data/electronic/domain.tsv"))

    domain = (
        pynutil.delete("domain:")
        + delete_space
        + pynutil.delete("\"")
        + (pynutil.add_weight(server_common, 1.09) | pynutil.add_weight(server_default, 1.1))
        + (pynutil.add_weight(domain_common, 1.09) | pynutil.add_weight(domain_default, 1.1))
        + delete_space
        + pynutil.delete("\"")
    )

    graph = user_name + delete_space + pynutil.insert("at ") + delete_space + domain + delete_space

    delete_tokens = self.delete_tokens(graph)
    self.fst = delete_tokens.optimize()
def __init__(self, cardinal: GraphFst):
    """
    Builds the classifier graph for "date" tokens in day-month-year order.

    Output layout:
        day: "..." month: "..."[ year: "..."] preserve_order: true

    Args:
        cardinal: graph object whose `graph_no_exception` is used for day and year;
            it is also stored as `self.cardinal`
    """
    super().__init__(name="date", kind="classify")

    def field(name: str, body):
        # Wraps `body` in `name: "..."`.
        return pynutil.insert(name + ": \"") + body + pynutil.insert("\"")

    self.cardinal = cardinal.graph_no_exception

    month_graph = field("month", pynini.string_file(get_abs_path("data/months.tsv")))

    # Premier is only ordinal used for dates
    day_graph = field("day", self.cardinal | pynini.cross("premier", "1"))

    optional_graph_year = pynini.closure(delete_extra_space + field("year", self.cardinal), 0, 1)

    graph_dmy = day_graph + delete_extra_space + month_graph + optional_graph_year
    date_graph = graph_dmy + pynutil.insert(" preserve_order: true")

    self.fst = self.add_tokens(date_graph).optimize()
def __init__(self):
    """
    Builds the verbalizer graph for "time" tokens.

    Layouts:
        hours minutes [suffix] [zone]
        hours [zone]             ->  "o'clock" is inserted after the hour
        hours suffix [zone]
    """
    super().__init__(name="time", kind="verbalize")

    def quoted_value(field_name: str):
        # Removes the field name, whatever `delete_space` consumes and the quotes.
        return (
            pynutil.delete(field_name)
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\"")
        )

    def optional_after_gap(field):
        return pynini.closure(delete_space + insert_space + field, 0, 1)

    hour = quoted_value("hours:")
    minute = quoted_value("minutes:")
    suffix = quoted_value("suffix:")
    zone = quoted_value("zone:")

    optional_suffix = optional_after_gap(suffix)
    optional_zone = optional_after_gap(zone)

    with_minutes = hour + delete_space + insert_space + minute + optional_suffix + optional_zone
    full_hour = hour + insert_space + pynutil.insert("o'clock") + optional_zone
    with_suffix = hour + delete_space + insert_space + suffix + optional_zone

    time_graph = with_minutes | full_hour | with_suffix
    self.fst = self.delete_tokens(time_graph).optimize()
def __init__(self, number_names: dict, deterministic: bool = True):
    """
    Builds the classifier graph for "telephone" tokens.

    A tagger and a matching verbalizer are built and composed; the composition is kept
    in `self.final_graph` and wrapped in `number_part: "..."` for `self.fst`.
    The number part has four groups of 3, 3, 2 and 2 digits separated by "-".

    Args:
        number_names: dictionary; its "cardinal_names_nominative" entry reads each digit group
        deterministic: forwarded unchanged to the parent constructor
    """
    super().__init__(name="telephone", kind="classify", deterministic=deterministic)

    # Between components
    separator = pynini.cross("-", " ")
    number = number_names["cardinal_names_nominative"]

    def field(name: str, body):
        # Wraps `body` in `name: "..."`.
        return pynutil.insert(name + ": \"") + body + pynutil.insert("\"")

    # --- tagger ---
    optional_plus = pynini.closure(pynutil.add_weight(pynutil.delete("+"), 0.1), 0, 1)
    country_code = field("country_code", optional_plus + number + separator)
    optional_country_code = pynini.closure(country_code + insert_space, 0, 1)

    # `**` binds tighter than `@`, which binds tighter than `+`.
    three_digits = NEMO_DIGIT ** 3 @ number
    two_digits = NEMO_DIGIT ** 2 @ number
    # In the last group leading zeros are read out one by one.
    last_two_digits = NEMO_DIGIT ** 2 @ (pynini.closure(pynini.cross("0", "ноль ")) + number)
    digit_groups = (
        three_digits + separator + three_digits + separator + two_digits + separator + last_two_digits
    )
    number_part = field("number_part", digit_groups)

    tagger_graph = (optional_country_code + number_part).optimize()

    # --- verbalizer ---
    spoken_text = pynini.closure(RU_ALPHA_OR_SPACE, 1)
    strip_country_code = pynini.closure(
        pynutil.delete("country_code: \"") + spoken_text + pynutil.delete("\"") + delete_space, 0, 1,
    )
    strip_number_part = pynutil.delete("number_part: \"") + spoken_text + pynutil.delete("\"")
    verbalizer_graph = (strip_country_code + strip_number_part).optimize()

    self.final_graph = (tagger_graph @ verbalizer_graph).optimize()
    self.fst = self.add_tokens(field("number_part", self.final_graph)).optimize()