def __init__(self, cardinal: GraphFst, deterministic: bool = True):
    """Tagger for written clock times, e.g. "2:30 pm" / "02:30" / "10:30:05 pm" / "2 pm est".

    Emits `hours`/`minutes`/`seconds` fields plus optional `suffix` (am/pm) and `zone`
    fields for a downstream verbalizer.

    Args:
        cardinal: CardinalFst instance; its `.graph` converts digit strings to words.
        deterministic: passed through to GraphFst.
    """
    super().__init__(name="time", kind="classify", deterministic=deterministic)
    suffix_graph = pynini.string_file(get_abs_path("data/time_suffix.tsv"))
    time_zone_graph = pynini.string_file(
        get_abs_path("data/time_zone.tsv"))
    # reuse the cardinal number-to-words graph for hour/minute values
    cardinal = cardinal.graph
    labels_hour = [str(x) for x in range(0, 24)]
    labels_minute_single = [str(x) for x in range(1, 10)]
    labels_minute_double = [str(x) for x in range(10, 60)]
    # accept two digits as-is, or strip an optional leading zero from a single digit
    delete_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (
        pynini.closure(pynutil.delete("0"), 0, 1) + NEMO_DIGIT)
    graph_hour = delete_leading_zero_to_double_digit @ pynini.union(
        *labels_hour) @ cardinal
    graph_minute_single = pynini.union(*labels_minute_single) @ cardinal
    graph_minute_double = pynini.union(*labels_minute_double) @ cardinal
    final_graph_hour = pynutil.insert(
        "hours: \"") + graph_hour + pynutil.insert("\"")
    # "05" -> "o five"; "35" -> "thirty five"
    final_graph_minute = (
        pynutil.insert("minutes: \"") +
        (pynini.cross("0", "o") + insert_space + graph_minute_single
         | graph_minute_double) + pynutil.insert("\""))
    # seconds share the same 1-59 verbalization as minutes
    final_graph_second = (
        pynutil.insert("seconds: \"") +
        (pynini.cross("0", "o") + insert_space + graph_minute_single
         | graph_minute_double) + pynutil.insert("\""))
    final_suffix = pynutil.insert("suffix: \"") + convert_space(
        suffix_graph) + pynutil.insert("\"")
    final_suffix_optional = pynini.closure(
        delete_space + insert_space + final_suffix, 0, 1)
    final_time_zone_optional = pynini.closure(
        delete_space + insert_space + pynutil.insert("zone: \"") +
        convert_space(time_zone_graph) + pynutil.insert("\""),
        0,
        1,
    )
    # 2:30 pm, 02:30, 2:00  (":00" minutes are dropped entirely)
    graph_hm = (
        final_graph_hour + pynutil.delete(":") +
        (pynutil.delete("00") | insert_space + final_graph_minute) +
        final_suffix_optional + final_time_zone_optional)
    # 10:30:05 pm  ("00" components verbalize as "zero")
    graph_hms = (final_graph_hour + pynutil.delete(":") +
                 (pynini.cross("00", " minutes: \"zero\"")
                  | insert_space + final_graph_minute) + pynutil.delete(":") +
                 (pynini.cross("00", " seconds: \"zero\"")
                  | insert_space + final_graph_second) +
                 final_suffix_optional + final_time_zone_optional)
    # 2.xx pm/am  (dot separator requires an explicit suffix)
    graph_hm2 = (
        final_graph_hour + pynutil.delete(".") +
        (pynutil.delete("00") | insert_space + final_graph_minute) +
        delete_space + insert_space + final_suffix +
        final_time_zone_optional)
    # 2 pm est  (bare hour also requires a suffix)
    graph_h = final_graph_hour + delete_space + insert_space + final_suffix + final_time_zone_optional
    final_graph = (graph_hm | graph_h | graph_hm2 | graph_hms).optimize()
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
("Ё́", "Е'"), ("И́", "И'"), ("О́", "О'"), ("У́", "У'"), ("Ы́", "Ы'"), ("Э́", "Э'"), ("Ю́", "Ю'"), ("Я́", "Я'"), ("а́", "а'"), ("е́", "е'"), ("ё́", "е'"), ("и́", "и'"), ("о́", "о'"), ("у́", "у'"), ("ы́", "ы'"), ("э́", "э'"), ("ю́", "ю'"), ("я́", "я'"), ("ё", "е"), ("Ё", "Е"), ] REWRITE_STRESSED = pynini.closure(pynini.string_map(RU_STRESSED_MAP).optimize() | RU_ALPHA).optimize() TO_LATIN = pynini.string_file(get_abs_path("data/cyrillic_to_latin.tsv")) RU_ALPHA_OR_SPACE = pynini.union(RU_ALPHA, NEMO_SPACE, NEMO_NON_BREAKING_SPACE).optimize() except (ModuleNotFoundError, ImportError): # Create placeholders RU_ALPHA = None LO_LATIN = None
def __init__(self):
    """Tagger for spoken English times, e.g. "five o' clock", "two o eight pm",
    "quarter past four", "half past four", "quarter to five".

    Emits `hours`/`minutes` fields plus optional `suffix` (am/pm) and `zone` fields.
    """
    super().__init__(name="time", kind="classify")
    # hours, minutes, seconds, suffix, zone, style, speak_period
    suffix_graph = pynini.string_file(
        get_abs_path("data/time/time_suffix.tsv"))
    time_zone_graph = pynini.invert(
        pynini.string_file(get_abs_path("data/time/time_zone.tsv")))
    time_to_graph = pynini.string_file(
        get_abs_path("data/time/time_to.tsv"))
    # negative weight favors the cardinal path over alternatives
    cardinal = pynutil.add_weight(CardinalFst().graph_no_exception,
                                  weight=-0.7)
    labels_hour = [num_to_word(x) for x in range(0, 24)]
    labels_minute_single = [num_to_word(x) for x in range(1, 10)]
    labels_minute_double = [num_to_word(x) for x in range(10, 60)]
    graph_hour = pynini.union(*labels_hour) @ cardinal
    graph_minute_single = pynini.union(*labels_minute_single) @ cardinal
    graph_minute_double = pynini.union(*labels_minute_double) @ cardinal
    graph_minute_verbose = pynini.cross("half", "30") | pynini.cross(
        "quarter", "15")
    oclock = pynini.cross(
        pynini.union("o' clock", "o clock", "o'clock", "oclock"), "")
    final_graph_hour = pynutil.insert(
        "hours: \"") + graph_hour + pynutil.insert("\"")
    # minutes may be absent (-> "00"), "o'clock" (-> "00"), "o five" (-> "05"), or "thirty five"
    final_graph_minute = (
        pynutil.insert("minutes: \"") +
        (pynutil.insert("00")
         | oclock + pynutil.insert("00")
         | pynutil.delete("o") + delete_space + graph_minute_single
         | graph_minute_double) + pynutil.insert("\""))
    final_suffix = pynutil.insert("suffix: \"") + convert_space(
        suffix_graph) + pynutil.insert("\"")
    final_suffix_optional = pynini.closure(
        delete_space + insert_space + final_suffix, 0, 1)
    final_time_zone_optional = pynini.closure(
        delete_space + insert_space + pynutil.insert("zone: \"") +
        convert_space(time_zone_graph) + pynutil.insert("\""),
        0,
        1,
    )
    # five o' clock
    # two o eight, two thirty five (am/pm)
    # two pm/am
    graph_hm = final_graph_hour + delete_extra_space + final_graph_minute
    # 10 past four, quarter past four, half past four
    graph_mh = (pynutil.insert("minutes: \"") + pynini.union(
        graph_minute_single, graph_minute_double, graph_minute_verbose) +
        pynutil.insert("\"") + delete_space + pynutil.delete("past") +
        delete_extra_space + final_graph_hour)
    # quarter to/till five -> minutes 45, hour mapped via time_to.tsv (hour - 1)
    graph_quarter_time = (pynutil.insert("minutes: \"") +
                          pynini.cross("quarter", "45") +
                          pynutil.insert("\"") + delete_space +
                          pynutil.delete(pynini.union("to", "till")) +
                          delete_extra_space + pynutil.insert("hours: \"") +
                          time_to_graph + pynutil.insert("\""))
    final_graph = ((graph_hm | graph_mh | graph_quarter_time) +
                   final_suffix_optional +
                   final_time_zone_optional).optimize()
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
# Shared acceptor/transducer building blocks for the grammars in this package.
NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize()
NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize()
NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize()
NEMO_HEX = pynini.union(*string.hexdigits).optimize()
NEMO_NON_BREAKING_SPACE = u"\u00A0"
NEMO_SPACE = " "
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize()
NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()
NEMO_PUNCT = pynini.union(
    *map(pynini.escape, string.punctuation)).optimize()
NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize()
# sigma star: accepts any string over the full character set
NEMO_SIGMA = pynini.closure(NEMO_CHAR)
delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE))
insert_space = pynutil.insert(" ")
# collapse one-or-more whitespace characters to a single space
delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")
# strip serialized token-ordering metadata emitted by the tagger
delete_preserve_order = pynini.closure(
    pynutil.delete(" preserve_order: true")
    | (pynutil.delete(" field_order: \"") + NEMO_NOT_QUOTE +
       pynutil.delete("\"")))
suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
# _v = pynini.union("a", "e", "i", "o", "u")
# English consonants, used for pluralization rules below
_c = pynini.union("b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n",
                  "p", "q", "r", "s", "t", "v", "w", "x", "y", "z")
# consonant + "y" -> "ies" (e.g. "city" -> "cities")
_ies = NEMO_SIGMA + _c + pynini.cross("y", "ies")
# NOTE(review): statement continues past this chunk boundary.
_es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x",
def __init__(
    self,
    input_case: str,
    deterministic: bool = False,
    cache_dir: str = None,
    overwrite_cache: bool = False,
    whitelist: str = None,
):
    """Composite Russian TN tagger: builds (or restores from a FAR cache) the
    tokenize-and-classify graph that unions all semiotic-class taggers.

    Args:
        input_case: casing mode, forwarded to WhiteListFst.
        deterministic: must be False — Ru TN is non-deterministic only.
        cache_dir: if set, FAR files are cached/restored here.
        overwrite_cache: rebuild grammars even if a cached FAR exists.
        whitelist: optional path to a whitelist file.

    Raises:
        ValueError: if deterministic is True.
    """
    super().__init__(name="tokenize_and_classify",
                     kind="classify",
                     deterministic=deterministic)
    if deterministic:
        raise ValueError(
            'Ru TN only supports non-deterministic cases and produces multiple normalization options.'
        )
    far_file = None
    if cache_dir is not None and cache_dir != "None":
        os.makedirs(cache_dir, exist_ok=True)
        whitelist_file = os.path.basename(whitelist) if whitelist else ""
        # cache key encodes casing, determinism and the whitelist file name
        far_file = os.path.join(
            cache_dir,
            f"_{input_case}_ru_tn_{deterministic}_deterministic{whitelist_file}.far"
        )
    if not overwrite_cache and far_file and os.path.exists(far_file):
        # fast path: restore the compiled graph from the FAR archive
        self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
        logging.info(f"ClassifyFst.fst was restored from {far_file}.")
    else:
        logging.info(f"Creating ClassifyFst grammars. This might take some time...")
        number_names = get_number_names()
        alternative_formats = get_alternative_formats()
        self.cardinal = CardinalFst(
            number_names=number_names,
            alternative_formats=alternative_formats,
            deterministic=deterministic)
        cardinal_graph = self.cardinal.fst
        self.ordinal = OrdinalFst(
            number_names=number_names,
            alternative_formats=alternative_formats,
            deterministic=deterministic)
        ordinal_graph = self.ordinal.fst
        self.decimal = DecimalFst(cardinal=self.cardinal,
                                  deterministic=deterministic)
        decimal_graph = self.decimal.fst
        self.measure = MeasureFst(cardinal=self.cardinal,
                                  decimal=self.decimal,
                                  deterministic=deterministic)
        measure_graph = self.measure.fst
        self.date = DateFst(number_names=number_names,
                            deterministic=deterministic)
        date_graph = self.date.fst
        word_graph = WordFst(deterministic=deterministic).fst
        self.time = TimeFst(number_names=number_names,
                            deterministic=deterministic)
        time_graph = self.time.fst
        self.telephone = TelephoneFst(number_names=number_names,
                                      deterministic=deterministic)
        telephone_graph = self.telephone.fst
        self.electronic = ElectronicFst(deterministic=deterministic)
        electronic_graph = self.electronic.fst
        self.money = MoneyFst(cardinal=self.cardinal,
                              decimal=self.decimal,
                              deterministic=deterministic)
        money_graph = self.money.fst
        self.whitelist = WhiteListFst(input_case=input_case,
                                      deterministic=deterministic,
                                      input_file=whitelist)
        whitelist_graph = self.whitelist.fst
        punct_graph = PunctuationFst(deterministic=deterministic).fst
        # lower weight = preferred; whitelist/measure outrank the rest, plain words last
        classify = (
            pynutil.add_weight(whitelist_graph, 1.01)
            | pynutil.add_weight(time_graph, 1.1)
            | pynutil.add_weight(date_graph, 1.09)
            | pynutil.add_weight(decimal_graph, 1.1)
            | pynutil.add_weight(measure_graph, 0.9)
            | pynutil.add_weight(cardinal_graph, 1.1)
            | pynutil.add_weight(ordinal_graph, 1.1)
            | pynutil.add_weight(money_graph, 1.1)
            | pynutil.add_weight(telephone_graph, 1.1)
            | pynutil.add_weight(electronic_graph, 1.1)
            | pynutil.add_weight(word_graph, 100)
        )
        punct = pynutil.insert("tokens { ") + pynutil.add_weight(
            punct_graph, weight=1.1) + pynutil.insert(" }")
        token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
        # allow punctuation tokens before and/or after each classified token
        token_plus_punct = (
            pynini.closure(punct + pynutil.insert(" ")) + token +
            pynini.closure(pynutil.insert(" ") + punct)
        )
        graph = token_plus_punct + pynini.closure(
            pynutil.add_weight(delete_extra_space, 1.1) + token_plus_punct)
        graph = delete_space + graph + delete_space
        self.fst = graph.optimize()
        if far_file:
            generator_main(far_file, {"tokenize_and_classify": self.fst})
            logging.info(f"ClassifyFst grammars are saved to {far_file}.")
def __init__(self):
    """English ITN cardinal tagger: converts spoken numbers ("two hundred three")
    to digit strings, tagging them as `integer` with an optional `negative` field.

    Numbers zero through twelve are excluded from `self.graph` (kept verbatim)
    but remain available via `self.graph_no_exception`.
    """
    super().__init__(name="cardinal", kind="classify")
    graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
    graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
    graph_hundred = pynini.cross("hundred", "")
    # hundreds place: "two hundred" -> "2", or insert "0" when absent
    graph_hundred_component = pynini.union(
        graph_digit + delete_space + graph_hundred, pynutil.insert("0"))
    graph_hundred_component += delete_space
    # tens+units: teens, or ties + optional digit, padding with zeros
    graph_hundred_component += pynini.union(
        graph_teen | pynutil.insert("00"),
        (graph_ties | pynutil.insert("0")) + delete_space +
        (graph_digit | pynutil.insert("0")),
    )
    # restrict to components with at least one non-zero digit
    graph_hundred_component_at_least_one_none_zero_digit = graph_hundred_component @ (
        pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") +
        pynini.closure(NEMO_DIGIT)
    )
    self.graph_hundred_component_at_least_one_none_zero_digit = (
        graph_hundred_component_at_least_one_none_zero_digit
    )
    # each magnitude: spoken component + keyword, or "000" filler (weighted to discourage)
    graph_thousands = pynini.union(
        graph_hundred_component_at_least_one_none_zero_digit +
        delete_space + pynutil.delete("thousand"),
        pynutil.insert("000", weight=0.1),
    )
    graph_million = pynini.union(
        graph_hundred_component_at_least_one_none_zero_digit +
        delete_space + pynutil.delete("million"),
        pynutil.insert("000", weight=0.1),
    )
    graph_billion = pynini.union(
        graph_hundred_component_at_least_one_none_zero_digit +
        delete_space + pynutil.delete("billion"),
        pynutil.insert("000", weight=0.1),
    )
    graph_trillion = pynini.union(
        graph_hundred_component_at_least_one_none_zero_digit +
        delete_space + pynutil.delete("trillion"),
        pynutil.insert("000", weight=0.1),
    )
    graph_quadrillion = pynini.union(
        graph_hundred_component_at_least_one_none_zero_digit +
        delete_space + pynutil.delete("quadrillion"),
        pynutil.insert("000", weight=0.1),
    )
    graph_quintillion = pynini.union(
        graph_hundred_component_at_least_one_none_zero_digit +
        delete_space + pynutil.delete("quintillion"),
        pynutil.insert("000", weight=0.1),
    )
    graph_sextillion = pynini.union(
        graph_hundred_component_at_least_one_none_zero_digit +
        delete_space + pynutil.delete("sextillion"),
        pynutil.insert("000", weight=0.1),
    )
    # concatenate all magnitudes from sextillions down to the hundreds component
    graph = pynini.union(
        graph_sextillion + delete_space + graph_quintillion + delete_space +
        graph_quadrillion + delete_space + graph_trillion + delete_space +
        graph_billion + delete_space + graph_million + delete_space +
        graph_thousands + delete_space + graph_hundred_component,
        graph_zero,
    )
    # strip leading zeros ("000...123" -> "123"), keeping a lone "0"
    graph = graph @ pynini.union(
        pynutil.delete(pynini.closure("0")) +
        pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0"
    )
    labels_exception = [num_to_word(x) for x in range(0, 13)]
    graph_exception = pynini.union(*labels_exception)
    # drop the connective "and" ("one hundred and five") before conversion
    graph = pynini.cdrewrite(pynutil.delete("and"), NEMO_SPACE, NEMO_SPACE,
                             NEMO_SIGMA) @ graph
    self.graph_no_exception = graph
    # exclude zero..twelve from the tagged graph
    self.graph = (pynini.project(graph, "input") -
                  graph_exception.arcsort()) @ graph
    optional_minus_graph = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("minus", "\"-\"") +
        NEMO_SPACE, 0, 1
    )
    final_graph = optional_minus_graph + pynutil.insert(
        "integer: \"") + self.graph + pynutil.insert("\"")
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def lg_containing_str(x, i):
    """Return the minimized FST accepting all strings that contain exactly ``i``
    consecutive occurrences of ``x``, with arbitrary context on either side.

    Args:
        x: acceptor (or string coercible to one) for the target substring.
        i: exact repetition count for ``x``.

    Fixed: the original body used an undefined free variable ``b`` and ignored
    the ``x`` parameter entirely; ``pynini.closure(x, i, i)`` is clearly intended.
    """
    # sigmaStar is the module-level any-string acceptor.
    return (sigmaStar + pynini.closure(x, i, i) + sigmaStar).minimize()
def rewrite(cardinal: 'pynini.FstLike') -> 'pynini.FstLike':
    """
    Function to rewrite cardinals written in traditional orthography (no '-' for
    numbers > 100) to current orthography ('-' between all words in the number
    string), e.g. deux mille cent vingt-trois -> deux-mille-cent-vingt-trois.

    In cases where the original orthography is current, or the string is a mixture
    of the two orthographies, this renders an invalid form that will not pass
    through CardinalFst, e.g. deux-mille cent-vingt-trois -> "deux##vingt-trois"
    ('#' is not accepted in the cardinal FST and will fail to convert).

    Args:
        cardinal: cardinal FST

    Fixed: the local previously named ``filter`` shadowed the ``filter`` builtin;
    renamed to ``marker_filter``. Behavior is unchanged.
    """
    # Traditional orthography does not hyphenate numbers > 100; this will insert
    # hyphens in those contexts.
    targets = pynini.string_map([
        "et",  # for 'et un/onze'
        "cent",
        "mille",
        "million",
        "milliard",
        "billion",
        "billiard",
        "trillion",
        "trilliard",
    ])
    targets += pynini.accep("s").ques

    no_spaces = pynini.closure(NEMO_NOT_SPACE)

    # Valid numbers in reformed orthography will have no spaces.
    new_orthography_sigma = no_spaces

    # Old orthography will not have these strings. Replacing with '#' to mark.
    targets_for_filtering = ("-" + targets) | ("-" + targets + "-") | (targets + "-")
    # Marked strings are invalid for the cardinal FST.
    marker_filter = pynini.cdrewrite(pynini.cross(targets_for_filtering, "#"),
                                     "", "", NEMO_SIGMA)

    # Marker character removed from sigma star.
    old_orthography_sigma = pynini.difference(NEMO_CHAR, "#")
    old_orthography_sigma.closure()

    # Only accept strings that occur in old orthography. (This avoids tying two
    # non-related numbers together.) e.g. mille cent-une -> mille-cent-une
    marker_filter @= old_orthography_sigma

    # Now we know replacements will only work around targets.
    replace_left = pynini.cdrewrite(pynini.cross(" ", "-"), "", targets, NEMO_SIGMA)
    replace_right = pynini.cdrewrite(pynini.cross(" ", "-"), targets, "", NEMO_SIGMA)
    replace = replace_left @ replace_right

    graph = new_orthography_sigma | (marker_filter @ replace)
    return graph @ cardinal
def __init__(self, number_names: dict, deterministic: bool):
    """Russian date tagger+verbalizer: DD.MM.YYYY / DD-MM-YY style dates, plus
    standalone years with "г."/"гг." abbreviations, fused into a single graph
    stored in a `day` field.

    Args:
        number_names: dict with an 'ordinal_number_names' FST (digits -> words).
        deterministic: passed through to GraphFst.
    """
    super().__init__(name="date",
                     kind="classify",
                     deterministic=deterministic)
    # Ru format: DD-MM-YYYY or DD-MM-YY
    month_abbr_to_names = pynini.string_file(
        get_abs_path("data/months/abbr_to_name.tsv")).optimize()
    # "." preferred over "/" or "-" as the date separator (lower weight)
    delete_sep = pynutil.add_weight(pynini.cross(
        ".", " "), 1.09) | pynutil.add_weight(
            pynini.cross(pynini.union("/", "-"), " "), 1.1)
    numbers = number_names['ordinal_number_names']
    # leading zero: preferably dropped, optionally verbalized as "ноль"
    zero = (pynutil.add_weight(pynini.cross("0", ""),
                               -0.1)) | (pynutil.add_weight(
                                   pynini.cross("0", "ноль "), 0.1))
    zero_digit = zero + pynini.compose(NEMO_DIGIT, numbers)
    digit_day = (pynini.union("1", "2", "3") + NEMO_DIGIT) | NEMO_DIGIT
    digit_day = pynini.compose(digit_day, numbers)
    day = (pynutil.insert("day: \"") + (zero_digit | digit_day) +
           pynutil.insert("\"")).optimize()
    digit_month = zero_digit | pynini.compose(
        pynini.accep("1") + NEMO_DIGIT, numbers)
    month_number_to_abbr = pynini.string_file(
        get_abs_path("data/months/numbers.tsv")).optimize()
    month_number_to_abbr = ((((pynutil.add_weight(pynini.cross("0", ""), -0.1)
                               | pynini.accep("1")) + NEMO_DIGIT)
                             | NEMO_DIGIT).optimize()
                            @ month_number_to_abbr).optimize()
    month_name = ((month_number_to_abbr @ month_abbr_to_names)
                  | pynutil.add_weight(month_abbr_to_names, 0.1)).optimize()
    month = (pynutil.insert("month: \"") + (month_name | digit_month) +
             pynutil.insert("\"")).optimize()
    year = pynini.compose(((NEMO_DIGIT**4) | (NEMO_DIGIT**2)),
                          numbers).optimize()
    year |= zero_digit
    # case forms of "год" (year), singular and plural
    year_word_singular = ["год", "года", "году", "годом", "годе"]
    # NOTE(review): "годам" appears twice; the duplicate is harmless inside
    # pynini.union but was likely unintended.
    year_word_plural = [
        "годы", "годов", "годам", "годами", "годам", "годах"
    ]
    year_word = pynini.cross("г.", pynini.union(*year_word_singular))
    year_word |= pynini.cross("гг.", pynini.union(*year_word_plural))
    year_word = (pynutil.add_weight(insert_space, -0.1)
                 | pynutil.add_weight(pynini.accep(" "), 0.1)) + year_word
    year_optional = pynutil.insert("year: \"") + year + pynini.closure(
        year_word, 0, 1) + pynutil.insert("\"")
    year_optional = pynini.closure(delete_sep + year_optional, 0,
                                   1).optimize()
    year_only = pynutil.insert(
        "year: \"") + year + year_word + pynutil.insert("\"")
    tagger_graph = (day + delete_sep + month + year_optional) | year_only
    # Verbalizer: strip the field markup back off (tagger and verbalizer are
    # composed below into a single transducer)
    day = (pynutil.delete("day:") + delete_space + pynutil.delete("\"") +
           pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\""))
    month = (pynutil.delete("month:") + delete_space + pynutil.delete("\"") +
             pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\""))
    year = (pynutil.delete("year:") + delete_space + pynutil.delete("\"") +
            pynini.closure(NEMO_NOT_QUOTE, 1) + delete_space +
            pynutil.delete("\""))
    year_optional = pynini.closure(delete_extra_space + year, 0, 1)
    graph_dmy = day + delete_extra_space + month + year_optional
    verbalizer_graph = (graph_dmy | year) + delete_space
    self.final_graph = pynini.compose(tagger_graph,
                                      verbalizer_graph).optimize()
    # the fully verbalized string is carried in a single `day` field
    self.fst = pynutil.insert(
        "day: \"") + self.final_graph + pynutil.insert("\"")
    self.fst = self.add_tokens(self.fst).optimize()
# Shared acceptor/transducer building blocks for the German grammars.
NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize()
NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize()
NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize()
NEMO_HEX = pynini.union(*string.hexdigits).optimize()
NEMO_NON_BREAKING_SPACE = u"\u00A0"
NEMO_SPACE = " "
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize()
NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()
NEMO_PUNCT = pynini.union(
    *map(pynini.escape, string.punctuation)).optimize()
NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize()
# sigma star: accepts any string over the full character set
NEMO_SIGMA = pynini.closure(NEMO_CHAR)
delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE))
insert_space = pynutil.insert(" ")
# collapse one-or-more whitespace characters to a single space
delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")
suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
# plural ending n/en: masculine nouns ending in e, ent, and, ant, ist, or
_n = NEMO_SIGMA + pynini.union("e") + pynutil.insert("n")
_en = (NEMO_SIGMA +
       pynini.union("ent", "and", "ant", "ist", "or", "ion", "ik", "heit",
                    "keit", "schaft", "tät", "ung") + pynutil.insert("en"))
# feminine "-in" -> "-ine" / "-innen"
_nen = NEMO_SIGMA + pynini.union("in") + (pynutil.insert("e")
                                          | pynutil.insert("nen"))
# loanword endings ma/um/us take "-en"
_fremd = NEMO_SIGMA + pynini.union("ma", "um", "us") + pynutil.insert("en")
# masculine nouns ending in eur, ich, ier, ig, ling, ör
def __init__(self):
    """French ITN cardinal tagger: converts spoken French numbers to digit
    strings (handling 'cent'/'cents' agreement and both traditional and
    reformed hyphenation via `rewrite`), tagging them as `integer` with an
    optional `negative` field.
    """
    super().__init__(name="cardinal", kind="classify")
    graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    graph_digit = pynini.string_file(
        get_abs_path("data/numbers/digit.tsv"))
    graph_teens = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
    graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
    graph_ties_unique = pynini.string_file(
        get_abs_path("data/numbers/ties_unique.tsv"))
    # Tens components
    graph_tens_component = graph_ties + (
        (delete_hyphen + graph_digit) | pynutil.insert("0"))
    graph_tens_component = pynini.union(graph_tens_component, graph_teens,
                                        graph_ties_unique)
    graph_tens_component_with_leading_zeros = pynini.union(
        graph_tens_component,
        (pynutil.insert("0") +
         (graph_digit | pynutil.insert("0", weight=0.01))))
    # Hundreds components
    graph_cent_singular = pynutil.delete("cent")  # Used in hundreds place
    graph_cent_plural = pynini.cross(
        "cents", "00"
    )  # Only used as terminus of hundred sequence. deux cents -> 200, deux cent un -> 201
    # exclude "un/une" before "cent" (French says "cent", not "un cent")
    graph_digit_no_one = pynini.project(pynini.union("un", "une"), 'input')
    graph_digit_no_one = (pynini.project(graph_digit, "input") -
                          graph_digit_no_one.arcsort()) @ graph_digit
    graph_hundreds_component_singular = (
        graph_digit_no_one + delete_hyphen + graph_cent_singular
    )  # Regular way: [1-9] * 100
    graph_hundreds_component_singular = pynini.union(
        graph_hundreds_component_singular, pynini.cross("cent", "1"))
    graph_hundreds_component_singular += delete_hyphen
    graph_hundreds_component_singular += graph_tens_component_with_leading_zeros
    graph_hundreds_component_plural = graph_digit_no_one + delete_hyphen + graph_cent_plural
    graph_hundreds_component = pynini.union(
        graph_hundreds_component_singular,
        graph_hundreds_component_plural,
        pynutil.insert("0") + graph_tens_component_with_leading_zeros,
    )
    # restrict to components with at least one non-zero digit
    graph_hundreds_component_at_least_one_none_zero_digit = graph_hundreds_component @ (
        pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") +
        pynini.closure(NEMO_DIGIT))
    self.graph_hundreds_component_at_least_one_none_zero_digit = rewrite(
        graph_hundreds_component_at_least_one_none_zero_digit).optimize()
    # Graph thousands (we'll need this for cases of mille millions, mille milliards...)
    graph_tens_of_hundreds_component_singular = (
        graph_tens_component + delete_hyphen + graph_cent_singular
    )  # Tens of hundreds. e.g. 1900 = nineteen hundred / 'dix neuf cents'
    graph_tens_of_hundreds_component_singular += delete_hyphen + graph_tens_component_with_leading_zeros
    graph_tens_of_hundreds_component_plural = graph_tens_component + delete_hyphen + graph_cent_plural
    graph_tens_of_hundred_component = (
        graph_tens_of_hundreds_component_plural
        | graph_tens_of_hundreds_component_singular)
    graph_thousands = pynini.union(
        graph_hundreds_component_at_least_one_none_zero_digit +
        delete_hyphen + pynutil.delete("mille"),
        pynutil.insert("001") +
        pynutil.delete("mille"),  # because 'mille', not 'un mille'
        pynutil.insert("000", weight=0.1),
    )
    # All other large amounts
    graph_millions = pynini.union(
        graph_hundreds_component_at_least_one_none_zero_digit +
        delete_hyphen +
        (pynutil.delete("million") | pynutil.delete("millions")),
        pynutil.insert("000", weight=0.1),
    )
    graph_milliards = pynini.union(  # French for English 'billion'
        graph_hundreds_component_at_least_one_none_zero_digit +
        delete_hyphen +
        (pynutil.delete("milliard") | pynutil.delete("milliards")),
        pynutil.insert("000", weight=0.1),
    )
    graph_billions = pynini.union(  # NOTE: this is English 'trillion.'
        graph_hundreds_component_at_least_one_none_zero_digit +
        delete_hyphen +
        (pynutil.delete("billions") | pynutil.delete("billion")),
        pynutil.insert("000", weight=0.1),
    )
    graph_mille_billion = pynini.union(
        graph_hundreds_component_at_least_one_none_zero_digit +
        delete_hyphen + pynutil.delete("mille"),
        pynutil.insert("001") + pynutil.delete(
            "mille"),  # because we say 'mille', not 'un mille'
    )
    graph_mille_billion += delete_hyphen + (
        graph_millions | pynutil.insert("000") + pynutil.delete("billions")
    )  # allow for 'mil millones'
    graph_mille_billion |= pynutil.insert("000000", weight=0.1)
    graph_billiards = pynini.union(
        graph_hundreds_component_at_least_one_none_zero_digit +
        delete_hyphen +
        (pynutil.delete("billiards") | pynutil.delete("billiard")),
        pynutil.insert("000", weight=0.1),
    )
    graph_trillions = pynini.union(  # One thousand English trillions.
        graph_hundreds_component_at_least_one_none_zero_digit +
        delete_hyphen +
        (pynutil.delete("trillions") | pynutil.delete("trillion")),
        pynutil.insert("000", weight=0.1),
    )
    graph_trilliards = pynini.union(
        graph_hundreds_component_at_least_one_none_zero_digit +
        delete_hyphen +
        (pynutil.delete("trilliards") | pynutil.delete("trilliard")),
        pynutil.insert("000", weight=0.1),
    )
    # concatenate all magnitudes from trilliards down to the hundreds component
    graph = pynini.union(
        graph_trilliards + delete_hyphen + graph_trillions + delete_hyphen +
        graph_billiards + delete_hyphen + graph_billions + delete_hyphen +
        graph_milliards + delete_hyphen + graph_millions + delete_hyphen +
        graph_thousands + delete_hyphen + graph_hundreds_component,
        graph_tens_of_hundred_component,
        graph_zero,
    )
    # strip leading zeros, keeping a lone "0"
    graph = graph @ pynini.union(
        pynutil.delete(pynini.closure("0")) + pynini.difference(
            NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0")
    graph = rewrite(graph)
    self.graph_no_exception = graph.optimize()
    # save self.numbers_up_to_thousand for use in DecimalFst
    digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3)
    numbers_up_to_thousand = pynini.compose(
        graph, digits_up_to_thousand).optimize()
    self.numbers_up_to_thousand = numbers_up_to_thousand
    # save self.numbers_up_to_million for use in DecimalFst
    digits_up_to_million = (NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3)
                            | (NEMO_DIGIT**4) | (NEMO_DIGIT**5)
                            | (NEMO_DIGIT**6))
    numbers_up_to_million = pynini.compose(
        graph, digits_up_to_million).optimize()
    self.numbers_up_to_million = numbers_up_to_million
    # don't convert cardinals from zero to nine inclusive
    graph_exception = pynini.project(pynini.union(graph_digit, graph_zero),
                                     'input')
    self.graph = (pynini.project(graph, "input") -
                  graph_exception.arcsort()) @ graph
    optional_minus_graph = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("moins", "\"-\"") +
        NEMO_SPACE, 0, 1)
    final_graph = optional_minus_graph + pynutil.insert(
        "integer: \"") + self.graph + pynutil.insert("\"")
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def __init__(self, deterministic: bool = True):
    """Russian electronic (e-mail) tagger+verbalizer: accepts `user@domain`,
    verbalizes "@" as "собака" and "." as "точка", transliterates Latin
    letters to Cyrillic, and stores the result in a `username` field.

    Args:
        deterministic: passed through to GraphFst.

    Fixed: the verbalizer's `user_name` expression contained corrupted text
    (`pynutil.delete("username:"******"\\"")`, not valid Python). Reconstructed
    as `pynutil.delete("username:") + delete_space + pynutil.delete("\\"")`,
    mirroring the parallel `domain` expression below.
    """
    super().__init__(name="electronic",
                     kind="classify",
                     deterministic=deterministic)
    # tagger: accept raw "user@domain" and wrap in username/domain fields
    accepted_symbols = []
    with open(get_abs_path("data/electronic/symbols.tsv"), 'r') as f:
        for line in f:
            symbol, _ = line.split('\t')
            accepted_symbols.append(pynini.accep(symbol))
    username = (pynutil.insert("username: \"") + NEMO_ALPHA +
                pynini.closure(NEMO_ALPHA | NEMO_DIGIT
                               | pynini.union(*accepted_symbols)) +
                pynutil.insert("\"") + pynini.cross('@', ' '))
    domain_graph = (
        NEMO_ALPHA +
        (pynini.closure(NEMO_ALPHA | NEMO_DIGIT | pynini.accep('-')
                        | pynini.accep('.'))) + (NEMO_ALPHA | NEMO_DIGIT))
    domain_graph = pynutil.insert(
        "domain: \"") + domain_graph + pynutil.insert("\"")
    tagger_graph = (username + domain_graph).optimize()
    # verbalizer: spell out digits/symbols, weighted to prefer known names
    graph_digit = pynini.string_file(
        get_abs_path("data/numbers/digits_nominative_case.tsv")).optimize()
    graph_symbols = pynini.string_file(
        get_abs_path("data/electronic/symbols.tsv")).optimize()
    user_name = (
        pynutil.delete("username:") + delete_space + pynutil.delete("\"") +
        (pynini.closure(
            pynutil.add_weight(graph_digit + insert_space, 1.09)
            | pynutil.add_weight(
                pynini.closure(graph_symbols + pynutil.insert(" ")), 1.09)
            | pynutil.add_weight(NEMO_NOT_QUOTE + insert_space, 1.1))) +
        pynutil.delete("\""))
    domain_default = (pynini.closure(NEMO_NOT_QUOTE + insert_space) +
                      pynini.cross(".", "точка ") + NEMO_NOT_QUOTE +
                      pynini.closure(insert_space + NEMO_NOT_QUOTE))
    server_default = (pynini.closure(
        (graph_digit | NEMO_ALPHA) + insert_space, 1) +
        pynini.closure(graph_symbols + insert_space) +
        pynini.closure((graph_digit | NEMO_ALPHA) + insert_space, 1))
    server_common = pynini.string_file(
        get_abs_path("data/electronic/server_name.tsv")) + insert_space
    domain_common = pynini.cross(".", "точка ") + pynini.string_file(
        get_abs_path("data/electronic/domain.tsv"))
    domain = (pynutil.delete("domain:") + delete_space +
              pynutil.delete("\"") +
              (pynutil.add_weight(server_common, 1.09)
               | pynutil.add_weight(server_default, 1.1)) +
              (pynutil.add_weight(domain_common, 1.09)
               | pynutil.add_weight(domain_default, 1.1)) + delete_space +
              pynutil.delete("\""))
    # "@" is verbalized as "собака"
    graph = user_name + delete_space + pynutil.insert(
        "собака ") + delete_space + domain + delete_space
    # replace all latin letters with their Ru verbalization
    verbalizer_graph = (graph.optimize() @ (pynini.closure(
        TO_CYRILLIC | RU_ALPHA | pynini.accep(" ")))).optimize()
    verbalizer_graph = verbalizer_graph.optimize()
    self.final_graph = (tagger_graph @ verbalizer_graph).optimize()
    # the fully verbalized string is carried in a single `username` field
    self.fst = self.add_tokens(
        pynutil.insert("username: \"") + self.final_graph +
        pynutil.insert("\"")).optimize()
def __init__(self, cardinal: GraphFst): super().__init__(name="telephone", kind="classify") # country code, number_part, extension digit_to_str = pynini.invert( pynini.string_file(get_abs_path("data/numbers/digit.tsv")) ).optimize() | pynini.cross("0", pynini.union("o", "oh", "zero")) str_to_digit = pynini.invert(digit_to_str) double_digit = pynini.union(*[ pynini.cross( pynini.project(str(i) @ digit_to_str, "output") + pynini.accep(" ") + pynini.project(str(i) @ digit_to_str, "output"), pynutil.insert("double ") + pynini.project(str(i) @ digit_to_str, "output"), ) for i in range(10) ]) double_digit.invert() # to handle cases like "one twenty three" two_digit_cardinal = pynini.compose(cardinal.graph_no_exception, NEMO_DIGIT**2) double_digit_to_digit = (pynini.compose( double_digit, str_to_digit + pynutil.delete(" ") + str_to_digit) | two_digit_cardinal) single_or_double_digit = (double_digit_to_digit | str_to_digit).optimize() single_or_double_digit = ( single_or_double_digit + pynini.closure(pynutil.delete(" ") + single_or_double_digit)).optimize() number_part = pynini.compose( single_or_double_digit, NEMO_DIGIT**3 + pynutil.insert("-") + NEMO_DIGIT**3 + pynutil.insert("-") + NEMO_DIGIT**4, ).optimize() number_part = pynutil.insert( "number_part: \"") + number_part.optimize() + pynutil.insert("\"") cardinal_option = pynini.compose(single_or_double_digit, NEMO_DIGIT**(2, 3)) country_code = ( pynutil.insert("country_code: \"") + pynini.closure(pynini.cross("plus ", "+"), 0, 1) + ((pynini.closure(str_to_digit + pynutil.delete(" "), 0, 2) + str_to_digit) | cardinal_option) + pynutil.insert("\"")) optional_country_code = pynini.closure( country_code + pynutil.delete(" ") + insert_space, 0, 1).optimize() graph = optional_country_code + number_part # credit card number space_four_digits = insert_space + NEMO_DIGIT**4 credit_card_graph = pynini.compose( single_or_double_digit, NEMO_DIGIT**4 + space_four_digits**3).optimize() graph |= pynutil.insert( "number_part: \"") + 
credit_card_graph.optimize() + pynutil.insert( "\"") # SSN ssn_graph = pynini.compose( single_or_double_digit, NEMO_DIGIT**3 + pynutil.insert("-") + NEMO_DIGIT**2 + pynutil.insert("-") + NEMO_DIGIT**4, ).optimize() graph |= pynutil.insert( "number_part: \"") + ssn_graph.optimize() + pynutil.insert("\"") # ip digit_or_double = pynini.closure(str_to_digit + pynutil.delete(" "), 0, 1) + double_digit_to_digit digit_or_double |= double_digit_to_digit + pynini.closure( pynutil.delete(" ") + str_to_digit, 0, 1) digit_or_double |= str_to_digit + (pynutil.delete(" ") + str_to_digit)**(0, 2) digit_or_double |= cardinal_option digit_or_double = digit_or_double.optimize() ip_graph = digit_or_double + (pynini.cross(" dot ", ".") + digit_or_double)**3 graph |= pynutil.insert( "number_part: \"") + ip_graph.optimize() + pynutil.insert("\"") final_graph = self.add_tokens(graph) self.fst = final_graph.optimize()
def __init__(self, time: GraphFst, date: GraphFst, cardinal: GraphFst,
             deterministic: bool = True, lm: bool = False):
    """Tagger for ranges and simple arithmetic expressions.

    Covers time ranges ("2-3" -> "2 to 3"), year ranges ("1980-95"),
    "~" approximations, "+" addition, and — in non-deterministic/LM mode —
    cardinal-to-cardinal ranges, multiplication ("x", "*"), "No. 12"
    expansion, and division ("/"). Output is a plain ``name`` token.

    Args:
        time: time tagger FST used for time ranges.
        date: date tagger FST used for year ranges.
        cardinal: cardinal tagger; its ``graph_with_and`` graph is used.
        deterministic: restrict to unambiguous rewrites when True.
        lm: enable the extra alternatives even when deterministic.
    """
    super().__init__(name="range", kind="classify", deterministic=deterministic)
    # Local, optional single-space deleter (shadows any module-level helper).
    delete_space = pynini.closure(pynutil.delete(" "), 0, 1)
    cardinal = cardinal.graph_with_and
    approx = pynini.cross("~", "approximately") + delete_extra_space
    # TIME
    time_graph = time + delete_space + pynini.cross(
        "-", " to ") + delete_space + time
    self.graph = time_graph | (approx + time)
    # YEAR: 4-digit (optionally "s"-suffixed, e.g. "1980s") to 4- or 2-digit.
    date_year_four_digit = (NEMO_DIGIT**4 + pynini.closure(pynini.accep("s"), 0, 1)) @ date
    date_year_two_digit = (NEMO_DIGIT**2 + pynini.closure(pynini.accep("s"), 0, 1)) @ date
    year_to_year_graph = (date_year_four_digit + delete_space
                          + pynini.cross("-", " to ") + delete_space
                          + (date_year_four_digit | date_year_two_digit
                             | (NEMO_DIGIT**2 @ cardinal)))
    self.graph |= year_to_year_graph
    # ADDITION
    range_graph = cardinal + pynini.closure(
        pynini.cross("+", " plus ") + cardinal, 1)
    range_graph |= cardinal + pynini.closure(
        pynini.cross(" + ", " plus ") + cardinal, 1)
    range_graph |= approx + cardinal
    if not deterministic or lm:
        # cardinal ---- e.g. "2-3" is ambiguous ("to" vs. "minus").
        cardinal_to_cardinal_graph = (
            cardinal + delete_space
            + pynini.cross("-", pynini.union(" to ", " minus "))
            + delete_space + cardinal)
        range_graph |= cardinal_to_cardinal_graph | (
            cardinal + delete_space + pynini.cross(":", " to ")
            + delete_space + cardinal)
        # MULTIPLY
        for x in [" x ", "x"]:
            range_graph |= cardinal + pynini.closure(
                pynini.cross(x, pynini.union(" by ", " times ")) + cardinal, 1)
        for x in ["*", " * "]:
            range_graph |= cardinal + pynini.closure(
                pynini.cross(x, " times ") + cardinal, 1)
        # supports "No. 12" -> "Number 12"
        range_graph |= ((pynini.cross(pynini.union("NO", "No"), "Number")
                         | pynini.cross("no", "number"))
                        + pynini.closure(pynini.union(". ", " "), 0, 1) + cardinal)
        for x in ["/", " / "]:
            range_graph |= cardinal + pynini.closure(
                pynini.cross(x, " divided by ") + cardinal, 1)
    self.graph |= range_graph
    self.graph = self.graph.optimize()
    graph = pynutil.insert("name: \"") + convert_space(
        self.graph).optimize() + pynutil.insert("\"")
    self.fst = graph.optimize()
def __init__(self):
    """Telephone verbalizer: strip the ``number_part`` field markup and emit
    the quoted value unchanged."""
    super().__init__(name="telephone", kind="verbalize")
    strip_open = pynutil.delete('number_part: "')
    strip_close = pynutil.delete('"')
    field_value = pynini.closure(NEMO_NOT_QUOTE, 1)
    graph = strip_open + field_value + strip_close
    self.fst = self.delete_tokens(graph).optimize()
def __init__(self, input_case: str, deterministic: bool = True,
             cache_dir: str = None, overwrite_cache: bool = False):
    """Final English text-normalization classifier.

    Composes all semiotic-class taggers into one weighted FST that wraps each
    match in a ``tokens { ... }`` block. Lower weight wins: whitelist (1.01)
    beats date (1.09), which beats the remaining classes (1.1); plain words
    are the weight-100 fallback. Optionally caches the compiled grammar as a
    FAR file.

    Args:
        input_case: casing mode forwarded to the whitelist tagger.
        deterministic: if False, also accept Roman numerals and abbreviations.
        cache_dir: directory for the FAR cache; "None"/None disables caching.
        overwrite_cache: rebuild the grammar even if a cached FAR exists.
    """
    super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)
    far_file = None
    if cache_dir is not None and cache_dir != "None":
        os.makedirs(cache_dir, exist_ok=True)
        far_file = os.path.join(
            cache_dir, f"_{input_case}_en_tn_{deterministic}_deterministic.far")
    if not overwrite_cache and far_file and os.path.exists(far_file):
        # Fast path: reuse the previously compiled grammar.
        self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
        logging.info(f'ClassifyFst.fst was restored from {far_file}.')
    else:
        logging.info(f"Creating ClassifyFst grammars.")
        # Build taggers; cardinal/decimal are shared by dependent grammars.
        cardinal = CardinalFst(deterministic=deterministic)
        cardinal_graph = cardinal.fst
        ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic)
        ordinal_graph = ordinal.fst
        decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic)
        decimal_graph = decimal.fst
        fraction = FractionFst(deterministic=deterministic, cardinal=cardinal)
        fraction_graph = fraction.fst
        measure = MeasureFst(cardinal=cardinal, decimal=decimal,
                             fraction=fraction, deterministic=deterministic)
        measure_graph = measure.fst
        date_graph = DateFst(cardinal=cardinal, deterministic=deterministic).fst
        word_graph = WordFst(deterministic=deterministic).fst
        time_graph = TimeFst(cardinal=cardinal, deterministic=deterministic).fst
        telephone_graph = TelephoneFst(deterministic=deterministic).fst
        # NOTE(review): "electonic" is a long-standing local-name typo for
        # "electronic"; harmless but worth fixing in a dedicated change.
        electonic_graph = ElectronicFst(deterministic=deterministic).fst
        money_graph = MoneyFst(cardinal=cardinal, decimal=decimal,
                               deterministic=deterministic).fst
        whitelist_graph = WhiteListFst(input_case=input_case,
                                       deterministic=deterministic).fst
        punct_graph = PunctuationFst(deterministic=deterministic).fst
        # Weighted union: smaller weight = preferred class at equal coverage.
        classify = (pynutil.add_weight(whitelist_graph, 1.01)
                    | pynutil.add_weight(time_graph, 1.1)
                    | pynutil.add_weight(date_graph, 1.09)
                    | pynutil.add_weight(decimal_graph, 1.1)
                    | pynutil.add_weight(measure_graph, 1.1)
                    | pynutil.add_weight(cardinal_graph, 1.1)
                    | pynutil.add_weight(ordinal_graph, 1.1)
                    | pynutil.add_weight(money_graph, 1.1)
                    | pynutil.add_weight(telephone_graph, 1.1)
                    | pynutil.add_weight(electonic_graph, 1.1)
                    | pynutil.add_weight(fraction_graph, 1.1)
                    | pynutil.add_weight(word_graph, 100))
        if not deterministic:
            roman_graph = RomanFst(deterministic=deterministic).fst
            # the weight matches the word_graph weight for "I" cases in long
            # sentences with multiple semiotic tokens
            classify |= pynutil.add_weight(roman_graph, 100)
            abbreviation_graph = AbbreviationFst(
                deterministic=deterministic).fst
            classify |= pynutil.add_weight(abbreviation_graph, 100)
        # Wrap each classification (and punctuation) in "tokens { ... }".
        punct = pynutil.insert("tokens { ") + pynutil.add_weight(
            punct_graph, weight=1.1) + pynutil.insert(" }")
        token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
        token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) + token
                            + pynini.closure(pynutil.insert(" ") + punct))
        graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
        # Trim leading/trailing whitespace from the whole sentence.
        graph = delete_space + graph + delete_space
        self.fst = graph.optimize()
        if far_file:
            generator_main(far_file, {"tokenize_and_classify": self.fst})
            logging.info(f"ClassifyFst grammars are saved to {far_file}.")
def __init__(self, deterministic: bool = True):
    """Spanish fraction verbalizer.

    Renders ``integer_part``/``numerator``/``denominator`` fields as spoken
    Spanish, handling three denominator styles ("-avo" stems, ordinals, and
    the "X sobre Y" cardinal fallback), gender agreement, and single-word
    merging of multi-word denominators. Exposes ``graph_masc``/``graph_fem``
    for reuse (e.g. by the measure verbalizer).
    """
    super().__init__(name="fraction", kind="verbalize", deterministic=deterministic)
    # Derivational strings append 'avo' as a suffix. Adding space for processing aid
    fraction_stem = pynutil.insert(" avo")
    plural = pynutil.insert("s")
    conjunction = pynutil.insert(" y ")
    integer = (pynutil.delete("integer_part: \"")
               + strip_cardinal_apocope(pynini.closure(NEMO_NOT_QUOTE))
               + pynutil.delete("\""))
    # "un" numerator is handled separately (singular denominator, drops in
    # "un medio" -> "medio" later).
    numerator_one = pynutil.delete("numerator: \"") + pynini.accep(
        "un") + pynutil.delete("\" ")
    numerator = (pynutil.delete("numerator: \"")
                 + pynini.difference(pynini.closure(NEMO_NOT_QUOTE), "un")
                 + pynutil.delete("\" "))
    # Three denominator encodings, selected by morphosyntactic_features.
    denominator_add_stem = pynutil.delete("denominator: \"") + (
        pynini.closure(NEMO_NOT_QUOTE) + fraction_stem
        + pynutil.delete("\" morphosyntactic_features: \"add_root\""))
    denominator_ordinal = pynutil.delete("denominator: \"") + (
        pynini.closure(NEMO_NOT_QUOTE)
        + pynutil.delete("\" morphosyntactic_features: \"ordinal\""))
    denominator_cardinal = pynutil.delete("denominator: \"") + (
        pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\""))
    denominator_singular = pynini.union(denominator_add_stem, denominator_ordinal)
    if not deterministic:
        # Occasional exceptions
        denominator_singular |= denominator_add_stem @ pynini.string_map(
            [("once avo", "undécimo"), ("doce avo", "duodécimo")])
    denominator_plural = denominator_singular + plural
    # Merging operations
    merge = pynini.cdrewrite(
        pynini.cross(" y ", "i"), "", "", NEMO_SIGMA
    )  # The denominator must be a single word, with the conjunction "y" replaced by i
    merge @= pynini.cdrewrite(delete_space, "",
                              pynini.difference(NEMO_CHAR, "parte"), NEMO_SIGMA)
    # The merger can produce duplicate vowels. This is not allowed in orthography
    delete_duplicates = pynini.string_map([("aa", "a"), ("oo", "o")])  # Removes vowels
    delete_duplicates = pynini.cdrewrite(delete_duplicates, "", "", NEMO_SIGMA)
    # Drop written accents on non-initial words when a fraction suffix follows.
    remove_accents = pynini.cdrewrite(
        accents,
        pynini.union(NEMO_SPACE, pynini.accep("[BOS]")) + pynini.closure(NEMO_NOT_SPACE),
        pynini.closure(NEMO_NOT_SPACE) + pynini.union("avo", "ava", "ésimo", "ésima"),
        NEMO_SIGMA,
    )
    merge_into_single_word = merge @ remove_accents @ delete_duplicates
    fraction_default = numerator + delete_space + insert_space + (
        denominator_plural @ merge_into_single_word)
    fraction_with_one = (numerator_one + delete_space + insert_space
                         + (denominator_singular @ merge_into_single_word))
    # Cardinal fallback: "X sobre Y".
    fraction_with_cardinal = strip_cardinal_apocope(numerator | numerator_one)
    fraction_with_cardinal += (
        delete_space + pynutil.insert(" sobre ")
        + strip_cardinal_apocope(denominator_cardinal))
    if not deterministic:
        # There is an alternative rendering where ordinals act as adjectives for 'parte'. This requires use of the feminine
        # Other rules will manage use of "un" at end, so just worry about endings
        exceptions = pynini.string_map([("tercia", "tercera")])
        apply_exceptions = pynini.cdrewrite(exceptions, "", "", NEMO_SIGMA)
        vowel_change = pynini.cdrewrite(pynini.cross("o", "a"), "",
                                        pynini.accep("[EOS]"), NEMO_SIGMA)
        denominator_singular_fem = shift_cardinal_gender(
            denominator_singular) @ vowel_change @ apply_exceptions
        denominator_plural_fem = denominator_singular_fem + plural
        numerator_one_fem = shift_cardinal_gender(numerator_one)
        numerator_fem = shift_cardinal_gender(numerator)
        fraction_with_cardinal |= (
            (numerator_one_fem | numerator_fem) + delete_space
            + pynutil.insert(" sobre ")
            + shift_cardinal_gender(denominator_cardinal))
        # Still need to manage stems
        merge_stem = pynini.cdrewrite(
            delete_space, "", pynini.union("avo", "ava", "avos", "avas"),
            NEMO_SIGMA)  # For managing alternative spacing
        merge_stem @= remove_accents @ delete_duplicates
        fraction_with_one_fem = numerator_one_fem + delete_space + insert_space
        fraction_with_one_fem += pynini.union(
            denominator_singular_fem @ merge_stem,
            denominator_singular_fem @ merge_into_single_word)  # Both forms exists
        fraction_with_one_fem += pynutil.insert(" parte")
        fraction_with_one_fem @= pynini.cdrewrite(
            pynini.cross("una media", "media"), "", "",
            NEMO_SIGMA)  # "media" not "una media"
        fraction_default_fem = numerator_fem + delete_space + insert_space
        fraction_default_fem += pynini.union(
            denominator_plural_fem @ merge_stem,
            denominator_plural_fem @ merge_into_single_word)
        fraction_default_fem += pynutil.insert(" partes")
        fraction_default |= (numerator + delete_space + insert_space
                             + denominator_plural @ merge_stem
                             )  # Case of no merger
        fraction_default |= fraction_default_fem
        fraction_with_one |= numerator_one + delete_space + insert_space + denominator_singular @ merge_stem
        fraction_with_one |= fraction_with_one_fem
    fraction_with_one @= pynini.cdrewrite(pynini.cross(
        "un medio", "medio"), "", "", NEMO_SIGMA)  # "medio" not "un medio"
    fraction = fraction_with_one | fraction_default | fraction_with_cardinal
    graph_masc = pynini.closure(integer + delete_space + conjunction, 0, 1) + fraction
    # Manage cases of fem gender (only shows on integer except for "medio")
    integer_fem = shift_cardinal_gender(integer)
    fraction_default |= (
        shift_cardinal_gender(numerator) + delete_space + insert_space
        + (denominator_plural @ pynini.cross("medios", "medias")))
    fraction_with_one |= (
        pynutil.delete(numerator_one) + delete_space
        + (denominator_singular @ pynini.cross("medio", "media")))
    fraction_fem = fraction_with_one | fraction_default | fraction_with_cardinal
    graph_fem = pynini.closure(integer_fem + delete_space + conjunction, 0, 1) + fraction_fem
    self.graph_masc = pynini.optimize(graph_masc)
    self.graph_fem = pynini.optimize(graph_fem)
    self.graph = graph_masc | graph_fem
    delete_tokens = self.delete_tokens(self.graph)
    self.fst = delete_tokens.optimize()
def get_pos_string(fsa, min_len, max_len):
    """Return the length-stratified positive language of ``fsa``.

    For each length ``i`` in ``[min_len, max_len]``, intersects ``fsa`` with
    ``sigma^i`` (exactly ``i`` symbols over the module-level alphabet acceptor
    ``sigma``), yielding the strings of that length the automaton accepts.

    Args:
        fsa: pynini acceptor to stratify.
        min_len: smallest length, inclusive.
        max_len: largest length, inclusive.

    Returns:
        dict mapping each length ``i`` to the acceptor ``fsa ∩ sigma^i``.
    """
    # NOTE: depends on the module-level `sigma` alphabet acceptor.
    return {
        i: pynini.intersect(fsa, pynini.closure(sigma, i, i))
        for i in range(min_len, max_len + 1)
    }
def __init__(self, input_case: str, deterministic: bool = True):
    """Text-normalization classifier (no fraction tagger, optional serial).

    Unions all semiotic-class taggers into one weighted FST that wraps each
    classified span in a ``tokens { ... }`` block. Lower weight wins:
    whitelist (1.01) < date (1.09) < the other classes (1.1); plain words are
    the weight-100 fallback. In non-deterministic mode a serial-number
    grammar is added.

    Args:
        input_case: casing mode forwarded to the whitelist tagger.
        deterministic: restrict to unambiguous rewrites when True.
    """
    super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)
    # Taggers; cardinal/decimal instances are shared by dependent grammars.
    cardinal = CardinalFst(deterministic=deterministic)
    cardinal_graph = cardinal.fst
    ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic)
    ordinal_graph = ordinal.fst
    decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic)
    decimal_graph = decimal.fst
    measure = MeasureFst(cardinal=cardinal, decimal=decimal,
                         deterministic=deterministic)
    measure_graph = measure.fst
    date_graph = DateFst(cardinal=cardinal, deterministic=deterministic).fst
    word_graph = WordFst(deterministic=deterministic).fst
    time_graph = TimeFst(cardinal=cardinal, deterministic=deterministic).fst
    telephone_graph = TelephoneFst(deterministic=deterministic).fst
    # NOTE(review): "electonic" is a local-name typo for "electronic".
    electonic_graph = ElectronicFst(deterministic=deterministic).fst
    money_graph = MoneyFst(cardinal=cardinal, decimal=decimal,
                           deterministic=deterministic).fst
    whitelist_graph = WhiteListFst(input_case=input_case,
                                   deterministic=deterministic).fst
    punct_graph = PunctuationFst(deterministic=deterministic).fst
    # Weighted union: smaller weight = preferred class at equal coverage.
    classify = (pynutil.add_weight(whitelist_graph, 1.01)
                | pynutil.add_weight(time_graph, 1.1)
                | pynutil.add_weight(date_graph, 1.09)
                | pynutil.add_weight(decimal_graph, 1.1)
                | pynutil.add_weight(measure_graph, 1.1)
                | pynutil.add_weight(cardinal_graph, 1.1)
                | pynutil.add_weight(ordinal_graph, 1.1)
                | pynutil.add_weight(money_graph, 1.1)
                | pynutil.add_weight(telephone_graph, 1.1)
                | pynutil.add_weight(electonic_graph, 1.1)
                | pynutil.add_weight(word_graph, 100))
    if not deterministic:
        serial_graph = SerialFst(cardinal, deterministic=deterministic).fst
        classify |= pynutil.add_weight(serial_graph, 1.1)
    classify = classify.optimize()
    # Wrap each classification (and punctuation) in "tokens { ... }".
    punct = pynutil.insert("tokens { ") + pynutil.add_weight(
        punct_graph, weight=1.1) + pynutil.insert(" }")
    token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
    token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) + token
                        + pynini.closure(pynutil.insert(" ") + punct))
    graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
    # Trim leading/trailing whitespace from the whole input.
    graph = delete_space + graph + delete_space
    self.fst = graph.optimize()
def get_neg_string(fsa, min_len, max_len):
    """Return the length-stratified negative language of ``fsa``.

    For each length ``i`` in ``[min_len, max_len]``, subtracts ``fsa`` from
    ``sigma^i`` (exactly ``i`` symbols over the module-level alphabet acceptor
    ``sigma``), yielding the strings of that length the automaton rejects.
    Complement of ``get_pos_string`` at each length.

    Args:
        fsa: pynini acceptor to stratify.
        min_len: smallest length, inclusive.
        max_len: largest length, inclusive.

    Returns:
        dict mapping each length ``i`` to the acceptor ``sigma^i − fsa``.
    """
    # NOTE: depends on the module-level `sigma` alphabet acceptor.
    return {
        i: pynini.difference(pynini.closure(sigma, i, i), fsa)
        for i in range(min_len, max_len + 1)
    }
def lg_containing_ssq(x, i):
    """Language of strings built from ``i`` occurrences of ``x`` padded by
    arbitrary context.

    Concatenates ``sigmaStar + x + sigmaStar`` exactly ``i`` times, so every
    accepted string embeds ``x`` (presumably as a scattered subsequence per
    the "ssq" name — confirm against callers) at least ``i`` times. The
    result is minimized before being returned.
    """
    one_occurrence = sigmaStar + x + sigmaStar
    repeated = pynini.closure(one_occurrence, i, i)
    return repeated.minimize()
def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
    """English whitelist tagger: verbatim phrase -> expanded ``name`` token.

    Unions several TSV-backed maps (TTS phrases, symbols, alternatives),
    "St." -> "Saint" before known names, dotted all-caps abbreviations,
    measurement-unit spellouts, and state abbreviations after a comma. A
    user-supplied ``input_file`` either extends (non-deterministic) or
    replaces (deterministic) the built-in lists.

    Args:
        input_case: "lower_cased" lowercases the written side of each entry.
        deterministic: restrict to unambiguous expansions when True.
        input_file: optional path to a user whitelist TSV.
    """
    super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

    def _get_whitelist_graph(input_case, file, keep_punct_add_end: bool = False):
        # Load a TSV of (written, spoken) pairs into a string_map, optionally
        # lowercased and optionally augmented with trailing-punctuation forms.
        whitelist = load_labels(file)
        if input_case == "lower_cased":
            whitelist = [[x.lower(), y] for x, y in whitelist]
        else:
            whitelist = [[x, y] for x, y in whitelist]
        if keep_punct_add_end:
            whitelist.extend(augment_labels_with_punct_at_end(whitelist))
        graph = pynini.string_map(whitelist)
        return graph

    graph = _get_whitelist_graph(input_case, get_abs_path("data/whitelist/tts.tsv"))
    graph |= _get_whitelist_graph(input_case, get_abs_path("data/whitelist/symbol.tsv"))
    if deterministic:
        # "St"/"St." before a known name -> "Saint".
        names = get_names()
        graph |= (
            pynini.cross(pynini.union("st", "St", "ST"), "Saint")
            + pynini.closure(pynutil.delete("."))
            + pynini.accep(" ")
            + names
        )
    else:
        graph |= _get_whitelist_graph(
            input_case, get_abs_path("data/whitelist/alternatives.tsv"), keep_punct_add_end=True
        )
        # Collapse dotted all-caps abbreviations, e.g. "U.S.A." -> "USA".
        for x in [".", ". "]:
            graph |= (
                NEMO_UPPER + pynini.closure(pynutil.delete(x) + NEMO_UPPER, 2) + pynini.closure(pynutil.delete("."), 0, 1)
            )
    if not deterministic:
        multiple_forms_whitelist_graph = get_formats(get_abs_path("data/whitelist/alternatives_all_format.tsv"))
        graph |= multiple_forms_whitelist_graph
        # Spell out measurement units (min. 3 chars to avoid false hits).
        graph_unit = pynini.string_file(get_abs_path("data/measure/unit.tsv")) | pynini.string_file(
            get_abs_path("data/measure/unit_alternatives.tsv")
        )
        graph_unit_plural = graph_unit @ SINGULAR_TO_PLURAL
        units_graph = pynini.compose(NEMO_CHAR ** (3, ...), convert_space(graph_unit | graph_unit_plural))
        graph |= units_graph
        # convert to states only if comma is present before the abbreviation to avoid converting all caps words,
        # e.g. "IN", "OH", "OK"
        # TODO or only exclude above?
        states = load_labels(get_abs_path("data/address/state.tsv"))
        additional_options = []
        for x, y in states:
            if input_case == "lower_cased":
                x = x.lower()
            # Also accept dotted abbreviation forms, e.g. "C.A" / "C.A.".
            additional_options.append((x, f"{y[0]}.{y[1:]}"))
            if not deterministic:
                additional_options.append((x, f"{y[0]}.{y[1:]}."))
        states.extend(additional_options)
        state_graph = pynini.string_map(states)
        graph |= pynini.closure(NEMO_NOT_SPACE, 1) + pynini.union(", ", ",") + pynini.invert(state_graph).optimize()
    if input_file:
        whitelist_provided = _get_whitelist_graph(input_case, input_file)
        if not deterministic:
            graph |= whitelist_provided
        else:
            # Deterministic mode: the user file replaces the built-in lists.
            graph = whitelist_provided
    self.graph = (convert_space(graph)).optimize()
    self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()
def __init__(self):
    """Fallback word tagger: wrap any maximal run of non-space characters in
    a ``name`` field."""
    super().__init__(name="word", kind="classify")
    open_field = pynutil.insert("name: \"")
    close_field = pynutil.insert("\"")
    token_chars = pynini.closure(NEMO_NOT_SPACE, 1)
    self.fst = (open_field + token_chars + close_field).optimize()
def __init__(self,
             min_word_constraint: str,
             name: str,
             cont_classes: List[Tuple[Optional[str], float]],
             alphabet: Dict[str, List[str]] = {},
             start: bool = False):
    """
    Converts a limited PCRE regex (scope, quantification) to an OpenFst FST.
    Substitutes phoneme classes with symbols. Assumes long vowels have been expanded.
    Unlike Slot, a StemGuesser's FST is eagerly evaluated

    Args:
        min_word_constraint: str
            a minimal word constraint expressed as a limited regular expression of phone classes
        name: str
            name of the StemGuesser Slot
        cont_classes: list[tuple[str, float]]
            list of continuation classes and their weights
            example: [('PluralSuffix', 0.8), (None, 0.5)]
            The StemGuesser's destination state is a final state if None is present in the list
            A StemGuesser can be both a terminal and non-terminal class
            Empty list of continuation classes are not allowed
        alphabet: dict[str, list[str]], optional
            dictionary mapping phone classes to list of symbols; if sigma (.) is used in the regex, alphabet is required
        start: bool, optional
            the slot is one of root slots (root class in LEXC)
    """
    # NOTE(review): `alphabet={}` is a mutable default argument; it is only
    # read here, but replacing it with `None` + local default would be safer.
    # phone classes could overlap so phones to set first
    symbols = {
        symb
        for symbol_class in alphabet.values() for symb in symbol_class
    }
    stack = []  # check for matching parens
    fst = None
    fst_stack = []  # to be used in union or scope mode; entries are (mode, fst)
    regex = min_word_constraint
    # () means scope / grouping - concatenation
    # [] means match anything inside - union
    # . means match any character in the alphabet (not including epsilon) - sigma
    # quantifiers: ?, *, +
    for i in range(len(regex)):
        if regex[i] == '[':
            stack.append(regex[i])
            fst_stack.append(('union', pynini.accep('')))
        elif regex[i] == '(':
            stack.append(regex[i])
            fst_stack.append(('scope', pynini.accep('')))
        elif regex[i] == ')':
            if stack.pop(-1) != '(':
                raise Exception('Unmatched parentheses')
            # Mark group complete so later chars don't extend it.
            fst_stack[-1] = ('processed', fst_stack[-1][1])
        elif regex[i] == ']':
            if stack.pop(-1) != '[':
                raise Exception('Unmatched brackets')
            fst_stack[-1] = ('processed', fst_stack[-1][1])
        elif fst_stack and fst_stack[-1][0] in ['scope', 'union']:
            # Ordinary character inside an open ( ) or [ ] group.
            if fst_stack[-1][0] == 'scope':
                # concatenate only the current chars
                if regex[i] not in alphabet:
                    fst_stack[-1][1].concat(regex[i])
                else:
                    # Phone class: expand to the union of its symbols.
                    fst_stack[-1][1].concat(
                        pynini.union(*alphabet[regex[i]]))
            elif fst_stack[-1][0] == 'union':
                if fst_stack[-1][1].num_states() == 1:
                    # make sure we don't union with empty string
                    if regex[i] not in alphabet:
                        fst_stack[-1][1].concat(regex[i])
                    else:
                        fst_stack[-1][1].concat(
                            pynini.union(*alphabet[regex[i]]))
                else:
                    # union only the current chars within the matching parens
                    if regex[i] not in alphabet:
                        fst_stack[-1][1].union(regex[i])
                    else:
                        fst_stack[-1][1].union(
                            pynini.union(*alphabet[regex[i]]))
        # sigma
        elif regex[i] == '.':
            if not alphabet:
                raise Exception(
                    'Alphabet required if regex includes sigma')
            # make copy each time to avoid state issues
            sigma = pynini.union(*list(symbols))
            fst_stack.append(('sigma', sigma))
        # quantification - perform closure on last FST
        elif regex[i] == '?':
            if i == 0:
                raise Exception('Empty quantification')
            fst_stack[-1] = (fst_stack[-1][0],
                             pynini.closure(fst_stack[-1][1], 0, 1))
        elif regex[i] == '*':
            if i == 0:
                raise Exception('Empty quantification')
            fst_stack[-1] = (fst_stack[-1][0],
                             pynini.closure(fst_stack[-1][1]))
            # if the entire regex is a Kleene closure or previous character is sigma, accept empty string too
            if (len(fst_stack) == 1 and i == len(regex) - 1) or (
                    fst_stack and fst_stack[-1][0] == 'sigma'):
                fst_stack[-1] = (fst_stack[-1][0],
                                 pynini.union(fst_stack[-1][1], ''))
        elif regex[i] == '+':
            if i == 0:
                raise Exception('Empty quantification')
            fst_stack[-1] = (fst_stack[-1][0],
                             pynini.closure(fst_stack[-1][1], 1))
        else:
            # Top-level literal symbol or phone class.
            if regex[i] not in alphabet:
                fst_stack.append(('symbol', pynini.accep(regex[i])))
            else:
                fst_stack.append(
                    ('symbol', pynini.union(*alphabet[regex[i]])))
    # Concatenate all parsed fragments left-to-right into the final FST.
    for (_, f) in fst_stack:
        if not fst:
            # first FST
            fst = f
        else:
            fst = fst + f
    if len(stack) > 0:
        # NOTE(review): this message also fires for unmatched '(' — consider
        # distinguishing parentheses from brackets.
        raise Exception('Unmatched brackets')
    # upper/lower alphabet symbol transitions and weights not used by compiler
    rules = [('', '', cont_classes, 0.0)]
    super(StemGuesser, self).__init__(name, rules, start)
    self.fst = fst.optimize()
def __init__(
    self,
    input_case: str,
    deterministic: bool = True,
    cache_dir: str = None,
    overwrite_cache: bool = True,
    whitelist: str = None,
):
    """Single-pass text-normalization grammar: each tagger is composed
    directly with its verbalizer, so the result maps written input straight
    to spoken output without an intermediate token parse.

    Lower weight wins: whitelist (1.01) < date (1.09) < other classes (1.1);
    plain words are the weight-100 fallback. The compiled grammar may be
    cached as a FAR file.

    Args:
        input_case: casing mode forwarded to the whitelist tagger.
        deterministic: if False, also accept Roman numerals/abbreviations.
        cache_dir: directory for the FAR cache; "None"/None disables caching.
        overwrite_cache: rebuild even if a cached FAR exists (default True).
        whitelist: optional path to a user whitelist TSV (also keyed into
            the cache file name).
    """
    super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)
    far_file = None
    if cache_dir is not None and cache_dir != 'None':
        os.makedirs(cache_dir, exist_ok=True)
        whitelist_file = os.path.basename(whitelist) if whitelist else ""
        far_file = os.path.join(
            cache_dir,
            f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
        )
    if not overwrite_cache and far_file and os.path.exists(far_file):
        # Fast path: reuse the previously compiled grammar.
        self.fst = pynini.Far(far_file, mode='r')['tokenize_and_classify']
        logging.info(f'ClassifyFst.fst was restored from {far_file}.')
    else:
        logging.info(
            f'Creating ClassifyFst grammars. This might take some time...')
        # TAGGERS
        cardinal = CardinalFst(deterministic=deterministic)
        cardinal_graph = cardinal.fst
        ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic)
        ordinal_graph = ordinal.fst
        decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic)
        decimal_graph = decimal.fst
        fraction = FractionFst(deterministic=deterministic, cardinal=cardinal)
        fraction_graph = fraction.fst
        measure = MeasureFst(cardinal=cardinal, decimal=decimal,
                             fraction=fraction, deterministic=deterministic)
        measure_graph = measure.fst
        date_graph = DateFst(cardinal=cardinal, deterministic=deterministic).fst
        word_graph = WordFst(deterministic=deterministic).graph
        time_graph = TimeFst(cardinal=cardinal, deterministic=deterministic).fst
        telephone_graph = TelephoneFst(deterministic=deterministic).fst
        electronic_graph = ElectronicFst(deterministic=deterministic).fst
        money_graph = MoneyFst(cardinal=cardinal, decimal=decimal,
                               deterministic=deterministic).fst
        # Rebinds the `whitelist` path argument to the tagger object; the
        # object is later passed to AbbreviationFst below.
        whitelist = WhiteListFst(input_case=input_case,
                                 deterministic=deterministic,
                                 input_file=whitelist)
        whitelist_graph = whitelist.graph
        punct_graph = PunctuationFst(deterministic=deterministic).graph
        # VERBALIZERS (taggers above are rebound to their verbalizer
        # counterparts from here on).
        cardinal = vCardinal(deterministic=deterministic)
        v_cardinal_graph = cardinal.fst
        decimal = vDecimal(cardinal=cardinal, deterministic=deterministic)
        v_decimal_graph = decimal.fst
        ordinal = vOrdinal(deterministic=deterministic)
        v_ordinal_graph = ordinal.fst
        fraction = vFraction(deterministic=deterministic)
        v_fraction_graph = fraction.fst
        v_telephone_graph = vTelephone(deterministic=deterministic).fst
        v_electronic_graph = vElectronic(deterministic=deterministic).fst
        measure = vMeasure(decimal=decimal, cardinal=cardinal,
                           fraction=fraction, deterministic=deterministic)
        v_measure_graph = measure.fst
        v_time_graph = vTime(deterministic=deterministic).fst
        v_date_graph = vDate(ordinal=ordinal, deterministic=deterministic).fst
        v_money_graph = vMoney(decimal=decimal, deterministic=deterministic).fst
        v_roman_graph = vRoman(deterministic=deterministic).fst
        v_abbreviation = vAbbreviation(deterministic=deterministic).fst
        # Compose tagger with verbalizer per class; weighted union as usual.
        classify_and_verbalize = (
            pynutil.add_weight(whitelist_graph, 1.01)
            | pynutil.add_weight(pynini.compose(time_graph, v_time_graph), 1.1)
            | pynutil.add_weight(
                pynini.compose(decimal_graph, v_decimal_graph), 1.1)
            | pynutil.add_weight(
                pynini.compose(measure_graph, v_measure_graph), 1.1)
            | pynutil.add_weight(
                pynini.compose(cardinal_graph, v_cardinal_graph), 1.1)
            | pynutil.add_weight(
                pynini.compose(ordinal_graph, v_ordinal_graph), 1.1)
            | pynutil.add_weight(
                pynini.compose(telephone_graph, v_telephone_graph), 1.1)
            | pynutil.add_weight(
                pynini.compose(electronic_graph, v_electronic_graph), 1.1)
            | pynutil.add_weight(
                pynini.compose(fraction_graph, v_fraction_graph), 1.1)
            | pynutil.add_weight(
                pynini.compose(money_graph, v_money_graph), 1.1)
            | pynutil.add_weight(word_graph, 100)
            | pynutil.add_weight(pynini.compose(date_graph, v_date_graph),
                                 1.09)).optimize()
        if not deterministic:
            roman_graph = RomanFst(deterministic=deterministic).fst
            # the weight matches the word_graph weight for "I" cases in long
            # sentences with multiple semiotic tokens
            classify_and_verbalize |= pynutil.add_weight(
                pynini.compose(roman_graph, v_roman_graph), 100)
            abbreviation_graph = AbbreviationFst(
                whitelist=whitelist, deterministic=deterministic).fst
            classify_and_verbalize |= pynutil.add_weight(
                pynini.compose(abbreviation_graph, v_abbreviation), 100)
        punct = pynutil.add_weight(punct_graph, weight=1.1)
        token_plus_punct = (pynini.closure(punct + pynutil.insert(" "))
                            + classify_and_verbalize
                            + pynini.closure(pynutil.insert(" ") + punct))
        graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
        # Trim leading/trailing whitespace from the whole input.
        graph = delete_space + graph + delete_space
        self.fst = graph.optimize()
        if far_file:
            generator_main(far_file, {"tokenize_and_classify": self.fst})
            logging.info(f'ClassifyFst grammars are saved to {far_file}.')
def __init__(self, decimal: GraphFst, cardinal: GraphFst, fraction: GraphFst, deterministic: bool):
    """Spanish measure verbalizer.

    Renders amount + unit tokens with gender agreement: masculine and
    feminine paths pair the matching cardinal/decimal/fraction sub-graphs
    with units of the same gender. Fractions take "de <unit>" except for
    "medio(s)/media(s)". A weighted fallback handles alphanumeric units.

    Args:
        decimal, cardinal, fraction: verbalizers providing gendered
            ``graph_masc``/``graph_fem`` sub-graphs.
        deterministic: forwarded to the base class.
    """
    super().__init__(name="measure", kind="verbalize", deterministic=deterministic)
    graph_decimal_masc = decimal.delete_tokens(decimal.graph_masc)
    graph_decimal_fem = decimal.delete_tokens(decimal.graph_fem)
    graph_cardinal_masc = cardinal.delete_tokens(cardinal.graph_masc)
    graph_cardinal_fem = cardinal.delete_tokens(cardinal.graph_fem)
    graph_fraction_fem = fraction.delete_tokens(fraction.graph_fem)
    graph_fraction_masc = fraction.delete_tokens(fraction.graph_masc)
    # Masculine unit, optionally followed by a "por ..." rate ("km por hora").
    unit_masc = (unit_plural_masc | unit_singular_masc) + pynini.closure(
        NEMO_WHITE_SPACE + "por" + pynini.closure(NEMO_NOT_QUOTE, 1), 0, 1)
    unit_masc |= "por" + pynini.closure(NEMO_NOT_QUOTE, 1)
    unit_masc = pynutil.delete("units: \"") + (
        pynini.closure(NEMO_NOT_QUOTE) @ unit_masc) + pynutil.delete("\"")
    unit_fem = (unit_plural_fem | unit_singular_fem) + pynini.closure(
        NEMO_WHITE_SPACE + "por" + pynini.closure(NEMO_NOT_QUOTE, 1), 0, 1)
    unit_fem = pynutil.delete("units: \"") + (
        pynini.closure(NEMO_NOT_QUOTE) @ unit_fem) + pynutil.delete("\"")
    graph_masc = (graph_cardinal_masc | graph_decimal_masc) + NEMO_WHITE_SPACE + unit_masc
    # Fractions link to the unit with "de".
    graph_masc |= graph_fraction_masc + NEMO_WHITE_SPACE + pynutil.insert(
        "de ") + unit_masc
    graph_masc |= pynutil.add_weight(
        graph_fraction_masc @ (NEMO_SIGMA + pynini.union("medio", "medios"))
        + NEMO_WHITE_SPACE + unit_masc,
        -0.001)  # "medio litro" not "medio de litro"
    graph_fem = (graph_cardinal_fem | graph_decimal_fem) + NEMO_WHITE_SPACE + unit_fem
    graph_fem |= graph_fraction_fem + NEMO_WHITE_SPACE + pynutil.insert(
        "de ") + unit_fem
    graph_fem |= pynutil.add_weight(
        graph_fraction_fem @ (NEMO_SIGMA + pynini.union("media", "medias"))
        + NEMO_WHITE_SPACE + unit_fem, -0.001)
    graph = graph_masc | graph_fem
    # Insert " de" after a quantity field before applying the graph.
    graph = (pynini.cdrewrite(
        pynutil.insert(" de"),
        "quantity: \"" + pynini.closure(NEMO_NOT_QUOTE, 1), "\"",
        NEMO_SIGMA) @ graph)  # billones de xyz
    # Restore apocopated "un" to "uno" before "por" rates.
    graph @= pynini.cdrewrite(pynini.cross(ones, "uno"), "",
                              NEMO_WHITE_SPACE + "por", NEMO_SIGMA)
    # To manage alphanumeric combinations ("a-8, 5x"), we let them use a weighted default path.
    alpha_num_unit = pynutil.delete("units: \"") + pynini.closure(
        NEMO_NOT_QUOTE) + pynutil.delete("\"")
    graph_alpha_num = pynini.union(
        (graph_cardinal_masc | graph_decimal_masc) + NEMO_SPACE + alpha_num_unit,
        alpha_num_unit + delete_extra_space + (graph_cardinal_masc | graph_decimal_masc),
    )
    graph |= pynutil.add_weight(graph_alpha_num, 0.01)
    graph += delete_preserve_order
    delete_tokens = self.delete_tokens(graph)
    self.fst = delete_tokens.optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst):
    """Spanish money tagger (inverse normalization, spoken -> written).

    Reads spoken amounts like "setenta y cinco dólares con sesenta y tres"
    and emits ``integer_part``/``fractional_part``/``currency`` fields
    (decimal separator is always a comma). Handles "un/una" singular units,
    standalone cents ("... centavos/céntimos"), and decimal amounts.

    Args:
        cardinal: cardinal tagger; ``graph_no_exception`` reads numbers.
        decimal: decimal tagger; ``final_graph_wo_negative`` reads decimals.
    """
    super().__init__(name="money", kind="classify")
    # quantity, integer_part, fractional_part, currency
    cardinal_graph = cardinal.graph_no_exception
    graph_decimal_final = decimal.final_graph_wo_negative
    # Currency TSVs map symbol -> word; inverted here to read words.
    unit_singular = pynini.string_file(
        get_abs_path("data/currency_singular.tsv"))
    unit_singular = pynini.invert(unit_singular)
    unit_plural = pynini.string_file(
        get_abs_path("data/currency_plural.tsv"))
    unit_plural = pynini.invert(unit_plural)
    graph_unit_singular = pynutil.insert("currency: \"") + convert_space(
        unit_singular) + pynutil.insert("\"")
    graph_unit_plural = pynutil.insert("currency: \"") + convert_space(
        unit_plural) + pynutil.insert("\"")
    # Pad single-digit cents to two digits ("5" -> "05").
    add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (
        pynutil.insert("0") + NEMO_DIGIT)
    # twelve dollars (and) fifty cents, zero cents
    cents_standalone = (
        pynutil.insert("morphosyntactic_features: \",\""
                       )  # always use a comma in the decimal
        + insert_space + pynutil.insert("fractional_part: \"") + pynini.union(
            pynutil.add_weight(
                ((NEMO_SIGMA - "un") @ cardinal_graph), -0.7)
            @ add_leading_zero_to_double_digit + delete_space
            + pynutil.delete(pynini.union("centavos", "céntimos")),
            pynini.cross("un", "01") + delete_space
            + pynutil.delete(pynini.union("centavo", "céntimo")),
        ) + pynutil.insert("\""))
    # Optional "(con|y) <cents> centavos" after the integer amount.
    optional_cents_standalone = pynini.closure(
        delete_space + pynini.closure(
            (pynutil.delete("con") | pynutil.delete('y')) + delete_space, 0, 1)
        + insert_space + cents_standalone,
        0,
        1,
    )
    # twelve dollars fifty, only after integer
    # setenta y cinco dólares con sesenta y tres~$75,63
    optional_cents_suffix = pynini.closure(
        delete_extra_space
        + pynutil.insert("morphosyntactic_features: \",\""
                         )  # always use a comma in the decimal
        + insert_space + pynutil.insert("fractional_part: \"")
        + pynini.closure(
            (pynutil.delete("con") | pynutil.delete('y')) + delete_space, 0, 1)
        + pynutil.add_weight(
            cardinal_graph @ add_leading_zero_to_double_digit, -0.7)
        + pynutil.insert("\""),
        0,
        1,
    )
    # Plural amounts (everything except bare "un"/"una").
    graph_integer = (pynutil.insert("integer_part: \"")
                     + ((NEMO_SIGMA - "un" - "una") @ cardinal_graph)
                     + pynutil.insert("\"") + delete_extra_space
                     + graph_unit_plural
                     + (optional_cents_standalone | optional_cents_suffix))
    # Singular "un/una" -> "1" with singular currency word.
    graph_integer |= (
        pynutil.insert("integer_part: \"")
        + (pynini.cross("un", "1") | pynini.cross("una", "1"))
        + pynutil.insert("\"") + delete_extra_space + graph_unit_singular
        + (optional_cents_standalone | optional_cents_suffix))
    graph_decimal = graph_decimal_final + delete_extra_space + graph_unit_plural
    # Cents-only amounts default to a zero-dollar integer part.
    graph_decimal |= pynutil.insert(
        "currency: \"$\" integer_part: \"0\" ") + cents_standalone
    final_graph = graph_integer | graph_decimal
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def __init__(self):
    """Classifier FST for time expressions.

    The keywords ("giờ" hour, "phút" minute, "giây" second, "rưỡi" half past,
    "kém" to/before) indicate a Vietnamese grammar. Tags hours, minutes,
    seconds and an optional time zone.
    """
    super().__init__(name="time", kind="classify")
    # hours, minutes, seconds, suffix, zone, style, speak_period
    # "_to" maps convert e.g. "3 kém 15" style readings to the clock value.
    graph_hours_to = pynini.string_file(get_abs_path("data/time/hours_to.tsv"))
    graph_minutes_to = pynini.string_file(get_abs_path("data/time/minutes_to.tsv"))
    graph_hours = pynini.string_file(get_abs_path("data/time/hours.tsv"))
    graph_minutes = pynini.string_file(get_abs_path("data/time/minutes.tsv"))
    # TSV maps zone -> spoken form; invert to accept the spoken form.
    time_zone_graph = pynini.invert(pynini.string_file(get_abs_path("data/time/time_zone.tsv")))

    graph_half = pynini.cross("rưỡi", "30")  # "half past" -> minutes 30
    oclock = pynini.cross("giờ", "")  # delete the hour word
    minute = pynini.cross("phút", "")  # delete the minute word
    optional_minute = pynini.closure(delete_space + minute, 0, 1)
    second = pynini.cross("giây", "")  # delete the second word

    final_graph_hour = pynutil.insert('hours: "') + graph_hours + pynutil.insert('"') + delete_space + oclock
    graph_minute = graph_minutes + optional_minute
    graph_second = graph_minutes + delete_space + second

    final_time_zone_optional = pynini.closure(
        delete_space + insert_space + pynutil.insert('zone: "') + convert_space(time_zone_graph) + pynutil.insert('"'),
        0,
        1,
    )

    # hour + minutes, where minutes may be "rưỡi" (half past)
    graph_hm = (
        final_graph_hour
        + delete_extra_space
        + pynutil.insert('minutes: "')
        + (graph_minute | graph_half)
        + pynutil.insert('"')
    )

    # hour + minutes + seconds
    graph_hms = (
        final_graph_hour
        + delete_extra_space
        + pynutil.insert('minutes: "')
        + graph_minutes
        + delete_space
        + minute
        + pynutil.insert('"')
        + delete_extra_space
        + pynutil.insert('seconds: "')
        + graph_second
        + pynutil.insert('"')
    )

    # minutes + seconds with no hour component
    graph_ms = (
        pynutil.insert('minutes: "')
        + graph_minutes
        + delete_space
        + minute
        + pynutil.insert('"')
        + delete_extra_space
        + pynutil.insert('seconds: "')
        + (graph_second | graph_half)
        + pynutil.insert('"')
    )

    # "X giờ kém Y" (Y minutes to X): map both fields through the "_to" tables.
    graph_hours_to_component = graph_hours @ graph_hours_to
    graph_minutes_to_component = graph_minutes @ graph_minutes_to
    graph_time_to = (
        pynutil.insert('hours: "')
        + graph_hours_to_component
        + pynutil.insert('"')
        + delete_space
        + oclock
        + delete_space
        + pynutil.delete("kém")
        + delete_extra_space
        + pynutil.insert('minutes: "')
        + graph_minutes_to_component
        + pynutil.insert('"')
        + optional_minute
    )

    # Zone is only attached to hour-anchored readings.
    final_graph = (final_graph_hour | graph_hm | graph_hms) + final_time_zone_optional
    final_graph |= graph_ms
    final_graph |= graph_time_to
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
    """Composite classifier: tokenizes input and tags every semiotic class.

    Builds (or restores from a FAR cache) the union of all per-class
    classifier grammars, wrapped in `tokens { ... }` with punctuation
    handled separately. Lower add_weight values are preferred, so e.g.
    whitelist (1.01) beats time (1.05) beats money (1.07); word is the
    100-weighted fallback.

    Args:
        cache_dir: directory for the compiled FAR cache; None (or the
            literal string "None") disables caching.
        overwrite_cache: if True, rebuild the grammars even when a cached
            FAR exists.
    """
    super().__init__(name="tokenize_and_classify", kind="classify")
    far_file = None
    if cache_dir is not None and cache_dir != "None":
        os.makedirs(cache_dir, exist_ok=True)
        far_file = os.path.join(cache_dir, "_en_itn.far")
    if not overwrite_cache and far_file and os.path.exists(far_file):
        self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
        # Lazy %-style args: the message is only formatted if INFO is enabled.
        logging.info("ClassifyFst.fst was restored from %s.", far_file)
    else:
        logging.info("Creating ClassifyFst grammars.")
        cardinal = CardinalFst()
        cardinal_graph = cardinal.fst
        fraction = FractionFst(cardinal)
        fraction_graph = fraction.fst
        ordinal = OrdinalFst()
        ordinal_graph = ordinal.fst
        decimal = DecimalFst(cardinal)
        decimal_graph = decimal.fst
        measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal).fst
        date_graph = DateFst(cardinal=cardinal).fst
        word_graph = WordFst().fst
        time_graph = TimeFst().fst
        money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst
        whitelist_graph = WhiteListFst().fst
        punct_graph = PunctuationFst().fst
        electronic_graph = ElectronicFst().fst
        telephone_graph = TelephoneFst().fst

        # Union of all classifiers; weight order resolves ambiguous spans.
        classify = (
            pynutil.add_weight(whitelist_graph, 1.01)
            | pynutil.add_weight(time_graph, 1.05)
            | pynutil.add_weight(date_graph, 1.09)
            | pynutil.add_weight(decimal_graph, 1.08)
            | pynutil.add_weight(measure_graph, 1.1)
            | pynutil.add_weight(cardinal_graph, 1.1)
            | pynutil.add_weight(ordinal_graph, 1.1)
            | pynutil.add_weight(fraction_graph, 1.09)
            | pynutil.add_weight(money_graph, 1.07)
            | pynutil.add_weight(telephone_graph, 1.1)
            | pynutil.add_weight(electronic_graph, 1.1)
            | pynutil.add_weight(word_graph, 100)
        )

        punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }")
        token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
        # Punctuation may precede and/or follow each token.
        token_plus_punct = (
            pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct)
        )

        graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
        graph = delete_space + graph + delete_space

        self.fst = graph.optimize()

        if far_file:
            generator_main(far_file, {"tokenize_and_classify": self.fst})
            logging.info("ClassifyFst grammars are saved to %s.", far_file)
def __init__(self, deterministic: bool = True):
    """Verbalizer FST for time tokens.

    Consumes the serialized `hours/minutes/seconds/suffix/zone` fields
    produced by the time classifier and emits the spoken English form
    (e.g. "... hours ... minutes and ... seconds", or "o'clock" for a
    bare hour).

    Args:
        deterministic: forwarded to the base class; not otherwise used here.
    """
    super().__init__(name="time", kind="verbalize", deterministic=deterministic)
    # Each field extractor strips its `name: "..."` wrapper and keeps the
    # quoted payload (one or more non-quote characters).
    hour = (
        pynutil.delete("hours:")
        + delete_space
        + pynutil.delete("\"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )
    minute = (
        pynutil.delete("minutes:")
        + delete_space
        + pynutil.delete("\"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )
    suffix = (
        pynutil.delete("suffix:")
        + delete_space
        + pynutil.delete("\"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )
    optional_suffix = pynini.closure(delete_space + insert_space + suffix, 0, 1)
    zone = (
        pynutil.delete("zone:")
        + delete_space
        + pynutil.delete("\"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )
    optional_zone = pynini.closure(delete_space + insert_space + zone, 0, 1)
    second = (
        pynutil.delete("seconds:")
        + delete_space
        + pynutil.delete("\"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )

    # Full duration-style reading: "H hours M minutes and S seconds".
    graph_hms = (
        hour
        + pynutil.insert(" hours ")
        + delete_space
        + minute
        + pynutil.insert(" minutes and ")
        + delete_space
        + second
        + pynutil.insert(" seconds")
        + optional_suffix
        + optional_zone
    )
    # Post-edit: drop the "o" filler and singularize unit words after "one",
    # applied only at a word boundary (space or start of string).
    graph_hms @= pynini.cdrewrite(
        pynutil.delete("o ")
        | pynini.cross("one minutes", "one minute")
        | pynini.cross("one seconds", "one second")
        | pynini.cross("one hours", "one hour"),
        pynini.union(" ", "[BOS]"),
        "",
        NEMO_SIGMA,
    )

    # hour + minute (optionally followed by suffix/zone)
    graph = hour + delete_space + insert_space + minute + optional_suffix + optional_zone
    # bare hour -> "<hour> o'clock"
    graph |= hour + insert_space + pynutil.insert("o'clock") + optional_zone
    # hour + suffix only (e.g. "two pm")
    graph |= hour + delete_space + insert_space + suffix + optional_zone
    graph |= graph_hms
    delete_tokens = self.delete_tokens(graph)
    self.fst = delete_tokens.optimize()