def __init__(self, deterministic: bool = True):
    """
    Finite state transducer for classifying punctuation.

    The accepted marks are every Unicode code point whose general category
    starts with "P" (except "[" and "]") plus a fixed list of ASCII marks,
    minus anything listed in data/whitelist/symbol.tsv. One or more such marks
    are accepted in a row. Angle-bracket tags without spaces or nested
    brackets (``<x>``, ``<x/>``, ``</x>``) are combined in through
    ``plurals._priority_union`` with the tag pattern passed first.

    Args:
        deterministic: forwarded to the base class constructor; not used
            otherwise in this method.
    """
    super().__init__(name="punctuation", kind="classify", deterministic=deterministic)

    ascii_marks = "!#%&\'()*+,-./:;<=>?@^_`{|}~\""
    excluded_marks = ["[", "]"]

    # Scan the code point range once and keep the punctuation characters.
    unicode_marks = []
    for code_point in range(sys.maxunicode):
        char = chr(code_point)
        if category(char).startswith("P") and char not in excluded_marks:
            unicode_marks.append(char)

    # First column of the whitelist file holds the symbols to leave out.
    whitelisted = [row[0] for row in load_labels(get_abs_path("data/whitelist/symbol.tsv"))]

    self.punct_marks = [mark for mark in unicode_marks + list(ascii_marks) if mark not in whitelisted]

    mark_run = pynini.closure(pynini.union(*self.punct_marks), 1)

    # Tag content: one or more NEMO_NOT_SPACE symbols that are not angle brackets.
    tag_body = pynini.closure(NEMO_NOT_SPACE - pynini.union("<", ">"), 1)
    opening_or_self_closing = tag_body + pynini.closure(pynini.accep("/"), 0, 1)
    closing = pynini.accep("/") + tag_body
    tag = pynini.accep("<") + (opening_or_self_closing | closing) + pynini.accep(">")

    self.graph = plurals._priority_union(tag, mark_run, NEMO_SIGMA)

    quoted_name = pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")
    self.fst = quoted_name.optimize()
def __init__(self, deterministic: bool = True):
    """
    Finite state transducer for classifying plain words.

    A word is one or more NEMO_NOT_SPACE symbols that are neither a digit nor
    one of "$", "€", "₩", "£", "¥", "#", "%". Bracketed phoneme sequences such
    as ``[HH AH0 L OW1]`` are combined in through ``plurals._priority_union``
    with the phoneme pattern passed first. The result is wrapped as
    ``name: "<word>"``.

    Args:
        deterministic: when False, one optional space is additionally allowed
            right after "[" and right before "]" in the phoneme pattern.
    """
    super().__init__(name="word", kind="classify", deterministic=deterministic)

    excluded = (pynini.union("$", "€", "₩", "£", "¥", "#", "%") | NEMO_DIGIT).optimize()
    plain_word = pynini.closure(pynini.difference(NEMO_NOT_SPACE, excluded), 1)

    # leave phones of format [HH AH0 L OW1] untouched
    phoneme_unit = pynini.closure(NEMO_ALPHA, 1) + pynini.closure(NEMO_DIGIT)
    left_bracket = pynini.accep(pynini.escape("["))
    right_bracket = pynini.accep(pynini.escape("]"))
    leading_units = pynini.closure(phoneme_unit + pynini.accep(" "))

    if deterministic:
        phoneme = left_bracket + leading_units + phoneme_unit + right_bracket
    else:
        phoneme = (
            left_bracket
            + pynini.closure(pynini.accep(" "), 0, 1)
            + leading_units
            + phoneme_unit
            + pynini.closure(pynini.accep(" "), 0, 1)
            + right_bracket
        )

    self.graph = plurals._priority_union(convert_space(phoneme), plain_word, NEMO_SIGMA)

    quoted_name = pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")
    self.fst = quoted_name.optimize()
def __init__(self, deterministic: bool = True):
    """
    Finite state transducer for verbalizing electronic tokens.

    Consumes the optional ``protocol: "..."`` field, the optional
    ``username:"..."`` field and the ``domain:"..."`` field, deleting the field
    markers and quotes. Digits and the symbols from data/electronic/symbol.tsv
    are rewritten to their mapped form padded with spaces, " at " is inserted
    after the user name, and runs of whitespace are collapsed at the end.

    Args:
        deterministic: when False, "0" may also be verbalized as "o" or "oh"
            in addition to "zero".
    """
    super().__init__(name="electronic", kind="verbalize", deterministic=deterministic)

    graph_digit_no_zero = pynini.invert(pynini.string_file(get_abs_path("data/number/digit.tsv"))).optimize()
    graph_zero = pynini.cross("0", "zero")
    if not deterministic:
        graph_zero |= pynini.cross("0", "o") | pynini.cross("0", "oh")
    graph_digit = graph_digit_no_zero | graph_zero

    graph_symbols = pynini.string_file(get_abs_path("data/electronic/symbol.tsv")).optimize()

    # Rewrite every digit/symbol anywhere in the string, padding it with spaces.
    default_chars_symbols = pynini.cdrewrite(
        pynutil.insert(" ") + (graph_symbols | graph_digit) + pynutil.insert(" "), "", "", NEMO_SIGMA
    )

    # The field marker, optional whitespace and the opening quote are deleted
    # the same way as for the "domain:" field below.
    user_name = (
        pynutil.delete("username:")
        + delete_space
        + pynutil.delete("\"")
        + default_chars_symbols
        + pynutil.delete("\"")
    )

    domain_common = pynini.string_file(get_abs_path("data/electronic/domain.tsv"))

    domain = (
        default_chars_symbols
        + insert_space
        # entries from domain.tsv are passed first; a bare "." becomes "dot"
        # with a small weight
        + plurals._priority_union(
            domain_common, pynutil.add_weight(pynini.cross(".", "dot"), weight=0.0001), NEMO_SIGMA
        )
        # optional remainder: rewritten with TO_UPPER, then spelled out
        + pynini.closure(
            insert_space + (pynini.cdrewrite(TO_UPPER, "", "", NEMO_SIGMA) @ default_chars_symbols), 0, 1
        )
    )
    domain = (
        pynutil.delete("domain:")
        + delete_space
        + pynutil.delete("\"")
        + domain
        + delete_space
        + pynutil.delete("\"")
    ).optimize()

    protocol = pynutil.delete("protocol: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")

    graph = (
        pynini.closure(protocol + delete_space, 0, 1)
        + pynini.closure(user_name + delete_space + pynutil.insert(" at ") + delete_space, 0, 1)
        + domain
        + delete_space
    ).optimize() @ pynini.cdrewrite(delete_extra_space, "", "", NEMO_SIGMA)

    delete_tokens = self.delete_tokens(graph)
    self.fst = delete_tokens.optimize()
def get_four_digit_year_graph(deterministic: bool = True):
    """
    Returns a four digit transducer which is combination of ties/teen or digits
    (using hundred instead of thousand format), e.g.
        1219 -> twelve nineteen
        3900 -> thirty nine hundred

    Inputs ending in "s" (decades such as "...0s" and centuries such as
    "..00s") are also covered; their verbalization is pluralized by rewriting
    a final "y" to "ies" or otherwise appending "s". A separate branch reads
    "X00Y" / "X000" inputs with the word "thousand".

    Args:
        deterministic: forwarded to ``get_ties_graph``. When True the thousand
            branch is combined through ``plurals._priority_union`` (passed
            first); otherwise it is plainly unioned in.

    Returns:
        The optimized year transducer.
    """
    ties = get_ties_graph(deterministic)
    teen_or_ties = graph_teen | ties
    hundred = pynini.cross("00", "hundred")

    # Plural forms: the trailing "0s" / "s" is consumed here and the plural
    # ending is put back on the verbalized side by the rewrite rule below.
    decade = (
        (ties + insert_space + ties)
        | (graph_teen + insert_space + (ties_graph | pynini.cross("1", "ten")))
    ) + pynutil.delete("0s")
    century_plural = teen_or_ties + insert_space + hundred + pynutil.delete("s")
    pluralize = pynini.cdrewrite(pynini.cross("y", "ies") | pynutil.insert("s"), "", "[EOS]", NEMO_SIGMA)
    plural_years = (decade | century_plural) @ pluralize

    # Plain forms: two two-digit halves, or a "hundred" reading for "..00".
    plain_years = ties + insert_space + ties
    plain_years |= teen_or_ties + insert_space + hundred

    # "thousand" readings.
    thousand_years = (
        graph_digit
        + insert_space
        + pynini.cross("00", "thousand")
        + (pynutil.delete("0") | insert_space + graph_digit)
    )
    thousand_years |= (
        graph_digit
        + insert_space
        + pynini.cross("000", "thousand")
        + pynini.closure(pynutil.delete(" "), 0, 1)
        + pynini.accep("s")
    )

    years = plain_years | plural_years
    if deterministic:
        years = plurals._priority_union(thousand_years, years, NEMO_SIGMA)
    else:
        years |= thousand_years
    return years.optimize()
def singular_to_plural():
    """
    Build the singular-to-plural transducer from suffix rules.

    Each rule accepts any string (NEMO_SIGMA) ending in one of the listed
    endings and inserts a plural suffix after it. The rules are unioned and
    combined with the ``suppletive`` mapping through
    ``plurals._priority_union``, with ``suppletive`` passed first.

    Returns:
        The optimized plural transducer.
    """

    def _rule(endings, suffix):
        # any stem + one of the endings, followed by the inserted suffix
        return NEMO_SIGMA + pynini.union(*endings) + suffix

    # plural ending n/en: masculine nouns ending in e, ent, and, ant, ist, or
    add_n = _rule(("e",), pynutil.insert("n"))
    add_en = _rule(
        ("ent", "and", "ant", "ist", "or", "ion", "ik", "heit", "keit", "schaft", "tät", "ung"),
        pynutil.insert("en"),
    )
    add_nen = _rule(("in",), pynutil.insert("e") | pynutil.insert("nen"))
    add_en_foreign = _rule(("ma", "um", "us"), pynutil.insert("en"))
    # masculine nouns ending in eur, ich, ier, ig, ling, ör
    add_e = _rule(("eur", "ich", "ier", "ig", "ling", "ör"), pynutil.insert("e"))
    add_s = _rule(("a", "i", "o", "u", "y"), pynutil.insert("s"))

    suffix_rules = pynini.union(add_n, add_en, add_nen, add_en_foreign, add_e, add_s)
    graph_plural = plurals._priority_union(suppletive, suffix_rules, NEMO_SIGMA).optimize()
    return graph_plural
def __init__(self, deterministic: bool = True):
    """
    Finite state transducer for classifying punctuation.

    Accepts a single mark: either one of the listed ASCII marks or any
    Unicode code point whose general category starts with "P", except "["
    and "]". Angle-bracket tags without spaces or nested brackets (``<x>``,
    ``<x/>``, ``</x>``) are combined in through ``plurals._priority_union``
    with the tag pattern passed first. The result is wrapped as
    ``name: "<mark>"``.

    Args:
        deterministic: forwarded to the base class constructor; not used
            otherwise in this method.
    """
    super().__init__(name="punctuation", kind="classify", deterministic=deterministic)

    ascii_marks = "!#%&\'()*+,-./:;<=>?@^_`{|}~\""

    unicode_marks = []
    for code_point in range(sys.maxunicode):
        char = chr(code_point)
        if category(char).startswith("P") and char not in "[]":
            unicode_marks.append(char)

    single_mark = pynini.union(*ascii_marks) | pynini.union(*unicode_marks)

    # Tag content: one or more NEMO_NOT_SPACE symbols that are not angle brackets.
    tag_body = pynini.closure(NEMO_NOT_SPACE - pynini.union("<", ">"), 1)
    opening_or_self_closing = tag_body + pynini.closure(pynini.accep("/"), 0, 1)
    closing = pynini.accep("/") + tag_body
    tag = pynini.accep("<") + (opening_or_self_closing | closing) + pynini.accep(">")

    self.graph = plurals._priority_union(tag, single_mark, NEMO_SIGMA)

    quoted_name = pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")
    self.fst = quoted_name.optimize()
def __init__(self, deterministic: bool = True):
    """
    Finite state transducer for classifying plain words.

    A word is one or more NEMO_NOT_SPACE symbols taken from the difference
    with the input side of ``PunctuationFst().graph``. Bracketed phoneme
    sequences such as ``[HH AH0 L OW1]`` are combined in through
    ``plurals._priority_union`` with the phoneme pattern passed first. The
    result is wrapped as ``name: "<word>"``.

    Args:
        deterministic: when False, strings made of one of
            "$", "€", "₩", "£", "¥", "#", "%" followed by one or more digits
            are additionally subtracted from the word graph.
    """
    super().__init__(name="word", kind="classify", deterministic=deterministic)

    punct_input = PunctuationFst().graph.project("input")
    word = pynini.closure(pynini.difference(NEMO_NOT_SPACE, punct_input), 1)

    if not deterministic:
        # "$" appears twice in the symbol list; this is harmless in a union
        symbol_then_digits = pynini.union("$", "€", "₩", "£", "¥", "#", "$", "%") + pynini.closure(
            NEMO_DIGIT, 1
        )
        word = pynini.closure(pynini.difference(word, symbol_then_digits), 1)

    # leave phones of format [HH AH0 L OW1] untouched
    phoneme_unit = pynini.closure(NEMO_ALPHA, 1) + pynini.closure(NEMO_DIGIT)
    left_bracket = pynini.accep(pynini.escape("["))
    right_bracket = pynini.accep(pynini.escape("]"))
    phoneme = left_bracket + pynini.closure(phoneme_unit + pynini.accep(" ")) + phoneme_unit + right_bracket

    self.graph = plurals._priority_union(convert_space(phoneme), word, NEMO_SIGMA)

    quoted_name = pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")
    self.fst = quoted_name.optimize()
def _get_two_digit_year(cardinal_graph, single_digits_graph):
    """
    Restrict a year reading to exactly two input digits.

    Args:
        cardinal_graph: graph passed first to ``plurals._priority_union``.
        single_digits_graph: graph passed second to ``plurals._priority_union``.

    Returns:
        The two-digit acceptor composed with the combined graph.
    """
    exactly_two_digits = NEMO_DIGIT ** 2
    reading = plurals._priority_union(cardinal_graph, single_digits_graph, NEMO_SIGMA)
    return exactly_two_digits @ reading
delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE)) insert_space = pynutil.insert(" ") delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ") suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv")) # _v = pynini.union("a", "e", "i", "o", "u") _c = pynini.union("b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z") _ies = NEMO_SIGMA + _c + pynini.cross("y", "ies") _es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es") _s = NEMO_SIGMA + pynutil.insert("s") graph_plural = plurals._priority_union( suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA).optimize() SINGULAR_TO_PLURAL = graph_plural PLURAL_TO_SINGULAR = pynini.invert(graph_plural) TO_LOWER = pynini.union(*[ pynini.cross(x, y) for x, y in zip(string.ascii_uppercase, string.ascii_lowercase) ]) TO_UPPER = pynini.invert(TO_LOWER) PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): # Create placeholders NEMO_CHAR = None
# plural endung n/en maskuline Nomen mit den Endungen e, ent, and, ant, ist, or _n = NEMO_SIGMA + pynini.union("e") + pynutil.insert("n") _en = (NEMO_SIGMA + pynini.union("ent", "and", "ant", "ist", "or", "ion", "ik", "heit", "keit", "schaft", "tät", "ung") + pynutil.insert("en")) _nen = NEMO_SIGMA + pynini.union("in") + (pynutil.insert("e") | pynutil.insert("nen")) _fremd = NEMO_SIGMA + pynini.union("ma", "um", "us") + pynutil.insert("en") # maskuline Nomen mit den Endungen eur, ich, ier, ig, ling, ör _e = NEMO_SIGMA + pynini.union("eur", "ich", "ier", "ig", "ling", "ör") + pynutil.insert("e") _s = NEMO_SIGMA + pynini.union("a", "i", "o", "u", "y") + pynutil.insert("s") graph_plural = plurals._priority_union( suppletive, pynini.union(_n, _en, _nen, _fremd, _e, _s), NEMO_SIGMA).optimize() SINGULAR_TO_PLURAL = graph_plural PLURAL_TO_SINGULAR = pynini.invert(graph_plural) TO_LOWER = pynini.union(*[ pynini.cross(x, y) for x, y in zip(string.ascii_uppercase, string.ascii_lowercase) ]) TO_UPPER = pynini.invert(TO_LOWER) PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): # Create placeholders NEMO_CHAR = None
def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = True, lm: bool = False):
    """
    Finite state transducer for classifying serial.

    The serial is a combination of digits, letters, symbols and the
    delimiters "-", "/" and " ", e.g. (per the original example) ``c325b`` is
    read as "c three two five b". Inputs without delimiters are handled by
    first inserting spaces between letters and digits/symbols. Inputs on the
    input side of ``ordinal.graph`` are excluded, and the final graph only
    accepts inputs that start with at least two NEMO_NOT_SPACE symbols.
    ``self.fst`` wraps the result as ``name: "<verbalized serial>"``.

    Args:
        cardinal: cardinal tagger; its ``single_digits_graph``, ``graph`` and
            ``graph_hundred_component_at_least_one_none_zero_digit`` are used.
        ordinal: ordinal tagger; the input side of its ``graph`` is excluded.
        deterministic: forwarded to the base class constructor.
        lm: when False and the grammar is non-deterministic, extra digit
            readings are added to the number graph.
    """
    super().__init__(name="integer", kind="classify", deterministic=deterministic)

    # 6+ digits are read digit by digit, 1-5 digits as a cardinal
    num_graph = pynini.compose(NEMO_DIGIT**(6, ...), cardinal.single_digits_graph).optimize()
    num_graph |= pynini.compose(NEMO_DIGIT**(1, 5), cardinal.graph).optimize()
    # to handle numbers starting with zero
    num_graph |= pynini.compose(
        pynini.accep("0") + pynini.closure(NEMO_DIGIT), cardinal.single_digits_graph
    ).optimize()
    # TODO: "#" doesn't work from the file
    symbols_graph = pynini.string_file(get_abs_path("data/whitelist/symbol.tsv")).optimize() | pynini.cross(
        "#", "hash"
    )
    num_graph |= symbols_graph

    if not self.deterministic and not lm:
        num_graph |= cardinal.single_digits_graph
        # also allow double digits to be pronounced as integer in serial number
        num_graph |= pynutil.add_weight(
            NEMO_DIGIT**2 @ cardinal.graph_hundred_component_at_least_one_none_zero_digit, weight=0.0001
        )

    # add space between letter and digit/symbol
    symbols = [x[0] for x in load_labels(get_abs_path("data/whitelist/symbol.tsv"))]
    symbols = pynini.union(*symbols)
    digit_symbol = NEMO_DIGIT | symbols

    graph_with_space = pynini.compose(
        pynini.cdrewrite(pynutil.insert(" "), NEMO_ALPHA | symbols, digit_symbol, NEMO_SIGMA),
        pynini.cdrewrite(pynutil.insert(" "), digit_symbol, NEMO_ALPHA | symbols, NEMO_SIGMA),
    )

    # serial graph with delimiter
    delimiter = pynini.accep("-") | pynini.accep("/") | pynini.accep(" ")
    alphas = pynini.closure(NEMO_ALPHA, 1)
    letter_num = alphas + delimiter + num_graph
    num_letter = pynini.closure(num_graph + delimiter, 1) + alphas
    next_alpha_or_num = pynini.closure(delimiter + (alphas | num_graph))
    next_alpha_or_num |= pynini.closure(
        delimiter
        + num_graph
        + plurals._priority_union(pynini.accep(" "), pynutil.insert(" "), NEMO_SIGMA).optimize()
        + alphas
    )

    serial_graph = letter_num + next_alpha_or_num
    serial_graph |= num_letter + next_alpha_or_num
    # numbers only with 2+ delimiters
    serial_graph |= (
        num_graph + delimiter + num_graph + delimiter + num_graph + pynini.closure(delimiter + num_graph)
    )
    # 2+ symbols
    serial_graph |= pynini.compose(NEMO_SIGMA + symbols + NEMO_SIGMA, num_graph + delimiter + num_graph)

    # exclude ordinal numbers from serial options
    serial_graph = pynini.compose(
        pynini.difference(NEMO_SIGMA, pynini.project(ordinal.graph, "input")), serial_graph
    ).optimize()

    serial_graph = pynutil.add_weight(serial_graph, 0.0001)

    serial_graph |= (
        pynini.closure(NEMO_NOT_SPACE, 1)
        + (pynini.cross("^2", " squared") | pynini.cross("^3", " cubed")).optimize()
    )

    # at least one serial graph with alpha numeric value and optional additional serial/num/alpha values
    serial_graph = (
        pynini.closure((serial_graph | num_graph | alphas) + delimiter)
        + serial_graph
        + pynini.closure(delimiter + (serial_graph | num_graph | alphas))
    )

    # delimiter-free inputs: insert the spaces first, then apply the graph above
    serial_graph |= pynini.compose(graph_with_space, serial_graph.optimize()).optimize()
    serial_graph = pynini.compose(pynini.closure(NEMO_NOT_SPACE, 2), serial_graph).optimize()

    self.graph = serial_graph.optimize()
    graph = pynutil.insert("name: \"") + convert_space(self.graph).optimize() + pynutil.insert("\"")
    self.fst = graph.optimize()
def __init__(
    self,
    input_case: str,
    deterministic: bool = True,
    cache_dir: str = None,
    overwrite_cache: bool = True,
    whitelist: str = None,
):
    """
    Build (or restore from a .far cache) the combined classify-and-verbalize
    graph and store it in ``self.fst``; ``self.fst_no_digits`` is the same
    graph composed with an acceptor over NEMO_CHAR minus NEMO_DIGIT.

    Args:
        input_case: forwarded to ``WhiteListFst`` and used in the cache file name.
        deterministic: forwarded to the base class and to the word, serial,
            roman and range taggers and the date, money, roman and word
            verbalizers; also part of the cache file name. The remaining
            grammars are built with hard-coded ``deterministic`` values.
        cache_dir: directory for the .far cache. ``None`` or the string
            ``'None'`` disables caching.
        overwrite_cache: when False and the cache file exists, the graph is
            loaded from it instead of being rebuilt.
        whitelist: path passed to ``WhiteListFst`` as ``input_file``; its base
            name is part of the cache file name.
    """
    super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)

    far_file = None
    if cache_dir is not None and cache_dir != 'None':
        os.makedirs(cache_dir, exist_ok=True)
        whitelist_file = os.path.basename(whitelist) if whitelist else ""
        far_file = os.path.join(
            cache_dir, f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}_lm.far"
        )
    if not overwrite_cache and far_file and os.path.exists(far_file):
        # Cache hit: only the main graph is stored, so the digit-free variant
        # is recomputed from it.
        self.fst = pynini.Far(far_file, mode='r')['tokenize_and_classify']
        no_digits = pynini.closure(pynini.difference(NEMO_CHAR, NEMO_DIGIT))
        self.fst_no_digits = pynini.compose(self.fst, no_digits).optimize()
        logging.info(f'ClassifyFst.fst was restored from {far_file}.')
    else:
        logging.info(f'Creating ClassifyFst grammars. This might take some time...')
        # TAGGERS
        cardinal = CardinalFst(deterministic=True, lm=True)
        # keep a handle on the tagger: `cardinal` is rebound to the verbalizer below
        cardinal_tagger = cardinal
        cardinal_graph = cardinal.fst

        ordinal = OrdinalFst(cardinal=cardinal, deterministic=True)
        ordinal_graph = ordinal.fst

        decimal = DecimalFst(cardinal=cardinal, deterministic=True)
        decimal_graph = decimal.fst
        fraction = FractionFst(deterministic=True, cardinal=cardinal)
        fraction_graph = fraction.fst

        measure = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, deterministic=True)
        measure_graph = measure.fst
        date = DateFst(cardinal=cardinal, deterministic=True, lm=True)
        date_graph = date.fst
        word_graph = WordFst(deterministic=deterministic).graph
        time_graph = TimeFst(cardinal=cardinal, deterministic=True).fst
        telephone_graph = TelephoneFst(deterministic=True).fst
        electronic_graph = ElectronicFst(deterministic=True).fst
        money_graph = MoneyFst(cardinal=cardinal, decimal=decimal, deterministic=False).fst
        # the `whitelist` path parameter is rebound to the tagger instance here
        whitelist = WhiteListFst(input_case=input_case, deterministic=False, input_file=whitelist)
        whitelist_graph = whitelist.graph
        punct_graph = PunctuationFst(deterministic=True).graph
        # must be built before `cardinal`/`ordinal` are rebound to verbalizers
        serial_graph = SerialFst(cardinal=cardinal, ordinal=ordinal, deterministic=deterministic, lm=True).fst

        # VERBALIZERS
        # From here on cardinal/decimal/ordinal/fraction/measure name verbalizers.
        cardinal = vCardinal(deterministic=True)
        v_cardinal_graph = cardinal.fst
        decimal = vDecimal(cardinal=cardinal, deterministic=True)
        v_decimal_graph = decimal.fst
        ordinal = vOrdinal(deterministic=True)
        v_ordinal_graph = ordinal.fst
        fraction = vFraction(deterministic=True, lm=True)
        v_fraction_graph = fraction.fst
        v_telephone_graph = vTelephone(deterministic=True).fst
        v_electronic_graph = vElectronic(deterministic=True).fst
        measure = vMeasure(decimal=decimal, cardinal=cardinal, fraction=fraction, deterministic=False)
        v_measure_graph = measure.fst
        v_time_graph = vTime(deterministic=True).fst
        v_date_graph = vDate(ordinal=ordinal, deterministic=deterministic, lm=True).fst
        v_money_graph = vMoney(decimal=decimal, deterministic=deterministic).fst
        v_roman_graph = vRoman(deterministic=deterministic).fst

        time_final = pynini.compose(time_graph, v_time_graph)
        # the date tagger is passed first to _priority_union, the cardinal tagger second
        cardinal_or_date_final = plurals._priority_union(date_graph, cardinal_graph, NEMO_SIGMA)
        cardinal_or_date_final = pynini.compose(cardinal_or_date_final, (v_cardinal_graph | v_date_graph))

        # Path weights: semiotic classes, plain words, punctuation.
        sem_w = 1
        word_w = 100
        punct_w = 2

        # Each tagger is composed with its verbalizer, then weighted.
        classify_and_verbalize = (
            pynutil.add_weight(time_final, sem_w)
            | pynutil.add_weight(pynini.compose(decimal_graph, v_decimal_graph), sem_w)
            | pynutil.add_weight(pynini.compose(measure_graph, v_measure_graph), sem_w)
            | pynutil.add_weight(pynini.compose(ordinal_graph, v_ordinal_graph), sem_w)
            | pynutil.add_weight(pynini.compose(telephone_graph, v_telephone_graph), sem_w)
            | pynutil.add_weight(pynini.compose(electronic_graph, v_electronic_graph), sem_w)
            | pynutil.add_weight(pynini.compose(fraction_graph, v_fraction_graph), sem_w)
            | pynutil.add_weight(pynini.compose(money_graph, v_money_graph), sem_w)
            | pynutil.add_weight(cardinal_or_date_final, sem_w)
            | pynutil.add_weight(whitelist_graph, sem_w)
            | pynutil.add_weight(
                pynini.compose(serial_graph, v_cardinal_graph), 1.1001
            )  # should be higher than the rest of the classes
        ).optimize()

        roman_graph = RomanFst(deterministic=deterministic, lm=True).fst
        # the weight matches the word_graph weight for "I" cases in long sentences with multiple semiotic tokens
        # NOTE(review): the weight applied here is sem_w (1), not word_w (100) - confirm the comment above is current
        classify_and_verbalize |= pynutil.add_weight(pynini.compose(roman_graph, v_roman_graph), sem_w)

        date_final = pynini.compose(date_graph, v_date_graph)
        range_graph = RangeFst(
            time=time_final, cardinal=cardinal_tagger, date=date_final, deterministic=deterministic
        ).fst
        v_word_graph = vWord(deterministic=deterministic).fst
        classify_and_verbalize |= pynutil.add_weight(pynini.compose(range_graph, v_word_graph), sem_w)

        # Semiotic outputs are wrapped in "< ... >"; plain words (added afterwards) are not.
        classify_and_verbalize = pynutil.insert("< ") + classify_and_verbalize + pynutil.insert(" >")
        classify_and_verbalize |= pynutil.add_weight(word_graph, word_w)

        punct_only = pynutil.add_weight(punct_graph, weight=punct_w)
        # One or more of: a whitespace run collapsed to one space, or a
        # punctuation mark preceded by an inserted space.
        punct = pynini.closure(
            pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
            | (pynutil.insert(" ") + punct_only),
            1,
        )

        def get_token_sem_graph(classify_and_verbalize):
            # Sentence-level graph: tokens with optional surrounding
            # punctuation, separated by whitespace or punctuation, or a
            # punctuation-only input. Uses `punct`, `punct_only` from the
            # enclosing scope.
            token_plus_punct = (
                pynini.closure(punct + pynutil.insert(" "))
                + classify_and_verbalize
                + pynini.closure(pynutil.insert(" ") + punct)
            )

            graph = token_plus_punct + pynini.closure(
                (
                    pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
                    | (pynutil.insert(" ") + punct + pynutil.insert(" "))
                )
                + token_plus_punct
            )

            graph |= punct_only + pynini.closure(punct)
            graph = delete_space + graph + delete_space

            # Output-side clean-up: collapse whitespace runs between
            # non-space chunks and drop leading spaces.
            remove_extra_spaces = pynini.closure(NEMO_NOT_SPACE, 1) + pynini.closure(
                delete_extra_space + pynini.closure(NEMO_NOT_SPACE, 1)
            )
            remove_extra_spaces |= (
                pynini.closure(pynutil.delete(" "), 1)
                + pynini.closure(NEMO_NOT_SPACE, 1)
                + pynini.closure(delete_extra_space + pynini.closure(NEMO_NOT_SPACE, 1))
            )

            graph = pynini.compose(graph.optimize(), remove_extra_spaces).optimize()
            return graph

        self.fst = get_token_sem_graph(classify_and_verbalize)
        no_digits = pynini.closure(pynini.difference(NEMO_CHAR, NEMO_DIGIT))
        self.fst_no_digits = pynini.compose(self.fst, no_digits).optimize()

        # Only the main graph is written to the cache.
        if far_file:
            generator_main(far_file, {"tokenize_and_classify": self.fst})
            logging.info(f'ClassifyFst grammars are saved to {far_file}.')
def get_address_graph(self, cardinal):
    """
    Finite state transducer for classifying street addresses, e.g. (per the
    original example)
        2788 San Tomas Expy, Santa Clara, CA 95051 ->
        "two seven eight eight San Tomas Expressway Santa Clara California nine five zero five one"

    An address is: a house number, an optional direction letter (E/S/W/N,
    expanded to the full word), capitalized street words ending in an entry
    from data/address/address_word.tsv, and then either an optional
    ``, city, state zip`` tail or an optional trailing "." that is removed.

    Args:
        cardinal: cardinal tagger; its ``graph``, ``single_digits_graph`` and
            ``graph_hundred_component_at_least_one_none_zero_digit`` are used.

    Returns:
        The address transducer (not optimized here).
    """
    # Ordinal street names: tag, wrap as an integer field, then verbalize.
    ordinal_verbalizer = OrdinalVerbalizer().graph
    ordinal_tagger = OrdinalTagger(cardinal=cardinal).graph
    ordinal_num = pynini.compose(
        pynutil.insert("integer: \"") + ordinal_tagger + pynutil.insert("\""), ordinal_verbalizer
    )

    # House number: 3-4 digit numbers are read as two groups (the last two
    # digits form the second group, with an optional "zero " for a leading 0).
    hundred_component = cardinal.graph_hundred_component_at_least_one_none_zero_digit
    leading_group = NEMO_DIGIT**(1, 2) @ hundred_component
    trailing_group = NEMO_DIGIT**2 @ (pynini.closure(pynini.cross("0", "zero "), 0, 1) + hundred_component)
    grouped_number = leading_group + (insert_space + trailing_group)
    # to handle the rest of the numbers
    grouped_number = pynini.compose(NEMO_DIGIT**(3, 4), grouped_number)
    house_number = plurals._priority_union(grouped_number, cardinal.graph, NEMO_SIGMA)

    # Optional direction letter with an optional "." that is deleted.
    direction_letter = (
        pynini.cross("E", "East")
        | pynini.cross("S", "South")
        | pynini.cross("W", "West")
        | pynini.cross("N", "North")
    ) + pynini.closure(pynutil.delete("."), 0, 1)
    direction = pynini.closure(pynini.accep(NEMO_SPACE) + direction_letter, 0, 1)

    # Street name: optional ordinal or a capitalized word, further capitalized
    # words, then a street-type word from the data file.
    street_type = get_formats(get_abs_path("data/address/address_word.tsv"))
    street_words = (
        pynini.accep(NEMO_SPACE)
        + (pynini.closure(ordinal_num, 0, 1) | NEMO_UPPER + pynini.closure(NEMO_ALPHA, 1))
        + NEMO_SPACE
        + pynini.closure(NEMO_UPPER + pynini.closure(NEMO_ALPHA) + NEMO_SPACE)
        + street_type
    )

    city_name = pynini.closure(NEMO_ALPHA | pynini.accep(NEMO_SPACE), 1)
    city = pynini.closure(pynini.accep(",") + pynini.accep(NEMO_SPACE) + city_name, 0, 1)

    # Extend the state table with a variant of each second column that has a
    # "." after its first character. The list is built before extending so the
    # table is not iterated while it grows.
    states = load_labels(get_abs_path("data/address/state.tsv"))
    states.extend([(first, f"{second[0]}.{second[1:]}") for first, second in states])
    state_name = pynini.invert(pynini.string_map(states))
    state = pynini.closure(pynini.accep(",") + pynini.accep(NEMO_SPACE) + state_name, 0, 1)

    # Five-digit zip code read digit by digit.
    zip_digits = pynini.compose(NEMO_DIGIT**5, cardinal.single_digits_graph)
    zip_code = pynini.closure(
        pynini.closure(pynini.accep(","), 0, 1) + pynini.accep(NEMO_SPACE) + zip_digits,
        0,
        1,
    )

    street = house_number + direction + street_words
    address = street + pynini.closure(city + state + zip_code, 0, 1)
    address |= street + pynini.closure(pynini.cross(".", ""), 0, 1)
    return address
delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE)) insert_space = pynutil.insert(" ") delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ") # French frequently compounds numbers with hyphen. delete_hyphen = pynutil.delete(pynini.closure("-", 0, 1)) insert_hyphen = pynutil.insert("-") suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv")) _s = NEMO_SIGMA + pynutil.insert("s") _x = NEMO_SIGMA + pynini.string_map([("eau"), ("eu"), ("ou")]) + pynutil.insert("x") _aux = NEMO_SIGMA + pynini.string_map([("al", "aux"), ("ail", "aux")]) graph_plural = plurals._priority_union( suppletive, plurals._priority_union(_s, pynini.union(_x, _aux), NEMO_SIGMA), NEMO_SIGMA).optimize() SINGULAR_TO_PLURAL = graph_plural PLURAL_TO_SINGULAR = pynini.invert(graph_plural) TO_LOWER = pynini.union(*[ pynini.cross(x, y) for x, y in zip(string.ascii_uppercase, string.ascii_lowercase) ]) TO_UPPER = pynini.invert(TO_LOWER) PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): # Create placeholders NEMO_CHAR = None
def __init__(self, deterministic: bool = True, lm: bool = False):
    """
    Finite state transducer for classifying cardinals.

    Loads the number-name grammar from data/number/cardinal_number_name.far
    and exposes several sub-graphs as attributes (``graph``,
    ``graph_with_and``, ``single_digits_graph``,
    ``graph_hundred_component_at_least_one_none_zero_digit``). ``self.fst``
    wraps the final graph as ``integer: "<words>"`` with an optional leading
    ``negative: "true" `` field for inputs starting with "-", and passes it
    through ``self.add_tokens``.

    Args:
        deterministic: when True, inputs of 5+ digits and inputs starting
            with "0" are read digit by digit; when False, several alternative
            readings are unioned in (digit-by-digit with "oh" or "zero",
            four-digit-year style, comma-grouped digits).
        lm: stored on the instance as ``self.lm``; not used otherwise here.
    """
    super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

    self.lm = lm
    self.deterministic = deterministic
    # TODO replace to have "oh" as a default for "0"
    number_names = pynini.Far(get_abs_path("data/number/cardinal_number_name.far")).get_fst()

    # 2-3 digits, or a single non-zero digit
    up_to_three_digits = pynini.closure(NEMO_DIGIT, 2, 3) | pynini.difference(NEMO_DIGIT, pynini.accep("0"))
    self.graph_hundred_component_at_least_one_none_zero_digit = up_to_three_digits @ number_names

    graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv"))
    graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv"))

    single_digits_graph = pynini.invert(graph_digit | graph_zero)
    digit_sequence = single_digits_graph + pynini.closure(insert_space + single_digits_graph)
    self.single_digits_graph = digit_sequence

    if not deterministic:
        # for a single token allow only the same normalization
        # "007" -> {"oh oh seven", "zero zero seven"} not {"oh zero seven"}
        single_digits_graph_oh = pynini.invert(graph_digit) | pynini.cross("0", "oh")
        oh_sequence = single_digits_graph_oh + pynini.closure(insert_space + single_digits_graph_oh)
        self.single_digits_graph = digit_sequence | oh_sequence

        # comma-grouped digit strings; only used by the non-deterministic union below
        digit_triple = (
            pynutil.delete(",")
            + single_digits_graph
            + insert_space
            + single_digits_graph
            + insert_space
            + single_digits_graph
        )
        single_digits_graph_with_commas = pynini.closure(
            self.single_digits_graph + insert_space, 1, 3
        ) + pynini.closure(digit_triple, 1)

    optional_minus_graph = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1
    )

    # 1-3 leading digits followed by groups of three, with or without commas
    grouped_digits = pynini.closure(NEMO_DIGIT, 1, 3) + (
        pynini.closure(pynutil.delete(",") + NEMO_DIGIT**3) | pynini.closure(NEMO_DIGIT**3)
    )
    graph = grouped_digits @ number_names

    self.graph = graph
    self.graph_with_and = self.add_optional_and(graph)

    if not deterministic:
        leading_zeros = pynini.compose(pynini.closure(pynini.accep("0"), 1), self.single_digits_graph)
        cardinal_with_leading_zeros = (
            leading_zeros
            + pynutil.insert(" ")
            + pynini.compose(pynini.closure(NEMO_DIGIT), self.graph_with_and)
        )

        # add small weight to non-default graphs to make sure the deterministic option is listed first
        final_graph = (
            self.graph_with_and
            | pynutil.add_weight(self.single_digits_graph, 0.0001)
            | get_four_digit_year_graph()  # allows e.g. 4567 be pronouced as forty five sixty seven
            | pynutil.add_weight(single_digits_graph_with_commas, 0.0001)
            | cardinal_with_leading_zeros
        )
    else:
        long_numbers = pynini.compose(NEMO_DIGIT**(5, ...), self.single_digits_graph).optimize()
        final_graph = plurals._priority_union(long_numbers, self.graph_with_and, NEMO_SIGMA).optimize()
        cardinal_with_leading_zeros = pynini.compose(
            pynini.accep("0") + pynini.closure(NEMO_DIGIT), self.single_digits_graph
        )
        final_graph |= cardinal_with_leading_zeros

    final_graph = optional_minus_graph + pynutil.insert("integer: \"") + final_graph + pynutil.insert("\"")
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def __init__(self, deterministic: bool = True, lm: bool = False): super().__init__(name="fraction", kind="verbalize", deterministic=deterministic) suffix = OrdinalFst().suffix integer = pynutil.delete("integer_part: \"") + pynini.closure( NEMO_NOT_QUOTE) + pynutil.delete("\" ") denominator_one = pynini.cross("denominator: \"one\"", "over one") denominator_half = pynini.cross("denominator: \"two\"", "half") denominator_quarter = pynini.cross("denominator: \"four\"", "quarter") denominator_rest = (pynutil.delete("denominator: \"") + pynini.closure(NEMO_NOT_QUOTE) @ suffix + pynutil.delete("\"")) denominators = plurals._priority_union( denominator_one, plurals._priority_union( denominator_half, plurals._priority_union(denominator_quarter, denominator_rest, NEMO_SIGMA), NEMO_SIGMA, ), NEMO_SIGMA, ).optimize() if not deterministic: denominators |= pynutil.delete("denominator: \"") + ( pynini.accep("four") @ suffix) + pynutil.delete("\"") numerator_one = pynutil.delete("numerator: \"") + pynini.accep( "one") + pynutil.delete("\" ") numerator_one = numerator_one + insert_space + denominators numerator_rest = ( pynutil.delete("numerator: \"") + (pynini.closure(NEMO_NOT_QUOTE) - pynini.accep("one")) + pynutil.delete("\" ")) numerator_rest = numerator_rest + insert_space + denominators numerator_rest @= pynini.cdrewrite( plurals._priority_union(pynini.cross("half", "halves"), pynutil.insert("s"), NEMO_SIGMA), "", "[EOS]", NEMO_SIGMA, ) graph = numerator_one | numerator_rest conjunction = pynutil.insert("and ") if not deterministic and not lm: conjunction = pynini.closure(conjunction, 0, 1) integer = pynini.closure(integer + insert_space + conjunction, 0, 1) graph = integer + graph graph @= pynini.cdrewrite( pynini.cross("and one half", "and a half") | pynini.cross("over ones", "over one"), "", "[EOS]", NEMO_SIGMA) self.graph = graph delete_tokens = self.delete_tokens(self.graph) self.fst = delete_tokens.optimize()