def add_optional_and(self, graph):
    """Extend ``graph`` with readings that say "and" after "hundred"/"thousand".

    Args:
        graph: FST whose output side is rewritten.

    Returns:
        Union of ``graph`` composed with the "and"-inserting rewrite and the
        original ``graph`` carrying a small positive weight (0.00001).
    """
    if not self.deterministic:
        # Optionally reduce one "hundred " to a single space in the output.
        drop_hundred = pynini.closure(pynini.cross("hundred ", " "), 0, 1)
        graph = pynini.compose(graph, NEMO_SIGMA + drop_hundred + NEMO_SIGMA)

    quote_free = pynini.closure(NEMO_NOT_QUOTE)

    # "hundred " -> "hundred and ", only when the remainder mentions neither
    # "thousand" nor "million".
    rest_without_thousand_million = pynini.difference(
        quote_free, quote_free + pynini.union("thousand", "million") + quote_free
    ).optimize()
    hundred_and = pynutil.add_weight(
        pynini.cross("hundred ", "hundred and ") + rest_without_thousand_million, -0.0001
    )
    and_rewrite = (quote_free + hundred_and).optimize()

    # "thousand " -> "thousand and ", only when the remainder has no "hundred".
    rest_without_hundred = pynini.difference(
        NEMO_SIGMA, quote_free + pynini.accep("hundred") + quote_free
    ).optimize()
    thousand_and = pynutil.add_weight(
        pynini.cross("thousand ", "thousand and ") + rest_without_hundred, -0.0001
    )
    and_rewrite |= (quote_free + thousand_and).optimize()

    with_and = pynini.compose(graph, and_rewrite).optimize()
    return with_and | pynutil.add_weight(graph, 0.00001)
def __init__(self, cardinal: GraphFst, deterministic: bool):
    """Build the decimal classification grammar.

    Args:
        cardinal: cardinal grammar providing ``graph``, ``single_digits_graph``
            and ``graph_hundred_component_at_least_one_none_zero_digit``.
        deterministic: when False, the fractional part may also be read with
            the full cardinal graph and "oh" is allowed for the integer part.
    """
    super().__init__(name="decimal", kind="classify", deterministic=deterministic)

    integer_graph = cardinal.graph
    nonzero_hundred_component = cardinal.graph_hundred_component_at_least_one_none_zero_digit

    # Fractional digits come from the cardinal's single-digit graph.
    self.graph = cardinal.single_digits_graph.optimize()
    if not deterministic:
        self.graph = self.graph | integer_graph

    self.graph_fractional = pynutil.insert('fractional_part: "') + self.graph + pynutil.insert('"')
    self.graph_integer = pynutil.insert('integer_part: "') + integer_graph + pynutil.insert('"')

    optional_integer = pynini.closure(self.graph_integer + pynutil.insert(" "), 0, 1)
    unsigned_decimal = optional_integer + pynutil.delete(".") + pynutil.insert(" ") + self.graph_fractional

    self.final_graph_wo_negative = unsigned_decimal | get_quantity(
        unsigned_decimal, nonzero_hundred_component
    )

    # reduce options for non_deterministic and allow either "oh" or "zero", but not combination
    if not deterministic:
        oh_then_zero = NEMO_SIGMA + "oh" + NEMO_SIGMA + "zero" + NEMO_SIGMA
        zero_then_oh = NEMO_SIGMA + "zero" + NEMO_SIGMA + "oh" + NEMO_SIGMA
        no_oh_zero = pynini.difference(NEMO_SIGMA, oh_then_zero | zero_then_oh).optimize()
        no_zero_oh = pynini.difference(
            NEMO_SIGMA,
            NEMO_SIGMA + pynini.accep("zero") + NEMO_SIGMA + pynini.accep("oh") + NEMO_SIGMA,
        ).optimize()

        zero_to_oh = pynini.cdrewrite(
            pynini.cross('integer_part: "zero"', 'integer_part: "oh"'),
            NEMO_SIGMA,
            NEMO_SIGMA,
            NEMO_SIGMA,
        )
        self.final_graph_wo_negative |= pynini.compose(self.final_graph_wo_negative, zero_to_oh)
        for mixed_filter in (no_oh_zero, no_zero_oh):
            self.final_graph_wo_negative = pynini.compose(
                self.final_graph_wo_negative, mixed_filter
            ).optimize()

    optional_negative = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", '"true" '), 0, 1
    )
    tagged = self.add_tokens(optional_negative + self.final_graph_wo_negative)
    self.fst = tagged.optimize()
def cody_rand_gen_no_duplicate(acceptor, n):
    """Draw up to ``n`` distinct random strings from ``acceptor``.

    Samples repeatedly (up to 50000 attempts) with a growing sample size and
    a new seed each time, accumulating unique strings across attempts.

    Args:
        acceptor: FST to sample from.
        n: number of distinct strings wanted.

    Returns:
        ``(acceptor, rand_list)``: the acceptor minus the strings of the last
        sample drawn, and a sorted list of at most ``n`` strings.
    """
    max_attempts = 50000
    unique_strings = set()
    seed = 0
    for attempt in range(max_attempts):
        print(f'started loop {attempt}')
        sample_size = int(n + n * attempt * 0.1)
        sample = pynini.randgen(
            acceptor,
            npath=sample_size,
            seed=seed,
            select='uniform',
            max_length=2147483647,
            weighted=False,
        )
        unique_strings = unique_strings.union(set(list_string_set(sample)))

        # Retry with a new seed unless we have enough strings or this is the
        # final attempt.
        if len(unique_strings) < n and attempt < max_attempts - 1:
            print('insufficient random strings')
            seed += 1
            continue

        chosen = list(unique_strings)
        random.shuffle(chosen)
        chosen = sorted(chosen[:n])
        # NOTE(review): only the last sample is subtracted, although `chosen`
        # may hold strings from earlier samples - confirm this is intended.
        acceptor = pynini.difference(acceptor, sample)
        print('returning')
        if len(chosen) >= n:
            print('got full rand_list\n')
        return acceptor, chosen
def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_hundred: 'pynini.FstLike') -> 'pynini.FstLike':
    """
    Build an FST tagging a cardinal or decimal followed by a quantity word,
    e.g. một triệu -> integer_part: "1" quantity: "triệu"
    e.g. một tỷ rưỡi -> integer_part: "1" fractional_part: "5" quantity: "tỷ"

    Args:
        decimal: decimal FST
        cardinal_up_to_hundred: cardinal FST

    Returns:
        FST for "<cardinal> <quantity> [<fraction digit>]" or
        "<decimal> <quantity>"; the decimal branch also accepts
        "ngàn" / "nghìn" as the quantity.
    """
    # Strip leading zeros; at least one non-zero leading digit must remain.
    without_leading_zeros = (
        pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT)
    )
    numbers = cardinal_up_to_hundred @ without_leading_zeros
    suffix = pynini.union("triệu", "tỉ", "tỷ", "vạn")

    # Special spoken forms allowed for the trailing fractional digit.
    graph_four = pynini.cross("tư", "4")
    graph_one = pynini.cross("mốt", "1")
    graph_half = pynini.cross("rưỡi", "5")
    trailing_digit = graph_digit | graph_half | graph_one | graph_four

    optional_fraction_graph = pynini.closure(
        delete_extra_space + pynutil.insert('fractional_part: "') + trailing_digit + pynutil.insert('"'),
        0,
        1,
    )

    cardinal_quantity = (
        pynutil.insert('integer_part: "')
        + numbers
        + pynutil.insert('"')
        + delete_extra_space
        + pynutil.insert('quantity: "')
        + suffix
        + pynutil.insert('"')
        + optional_fraction_graph
    )
    decimal_quantity = (
        decimal
        + delete_extra_space
        + pynutil.insert('quantity: "')
        + (suffix | "ngàn" | "nghìn")
        + pynutil.insert('"')
    )

    res = cardinal_quantity
    res |= decimal_quantity
    return res
def __init__(self, input_case: str, deterministic: bool = True):
    """Build the whitelist classification grammar.

    Args:
        input_case: accepted for interface compatibility; the written form is
            lower-cased for every value.
        deterministic: forwarded to the base class.
    """
    super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

    def _get_whitelist_graph(input_case, file="data/whitelist.tsv"):
        # Both the "lower_cased" branch and the fallback branch lower-case the
        # first column, so a single expression covers them.
        rows = [[row[0].lower()] + row[1:] for row in load_labels(get_abs_path(file))]
        return pynini.string_map(rows)

    graph = _get_whitelist_graph(input_case)

    # do not replace single letter units, like `м` or `°`
    all_units = _get_whitelist_graph(input_case, file="data/measurements.tsv")
    allowed_unit_inputs = pynini.difference(pynini.project(all_units, "input"), NEMO_ALPHA)
    units_graph = pynini.compose(allowed_unit_inputs, all_units)

    graph |= units_graph.optimize()
    graph |= TO_LATIN + pynini.closure(pynutil.insert(" ") + TO_LATIN)

    self.final_graph = convert_space(graph)
    named = pynutil.insert('name: "') + self.final_graph + pynutil.insert('"')
    self.fst = named.optimize()
def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_hundred: 'pynini.FstLike') -> 'pynini.FstLike':
    """
    Build an FST tagging a cardinal or decimal followed by a quantity word,
    e.g. eine million -> integer_part: "1" quantity: "million"
    e.g. eins komma vier millionen -> integer_part: "1" fractional_part: "4" quantity: "millionen"

    Args:
        decimal: decimal FST
        cardinal_up_to_hundred: cardinal FST

    Returns:
        FST accepting "<cardinal> <quantity>" or "<decimal> <quantity>".
    """
    # Strip leading zeros; at least one non-zero leading digit must remain.
    without_leading_zeros = (
        pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT)
    )
    numbers = cardinal_up_to_hundred @ without_leading_zeros

    quantity_words = (
        "million",
        "millionen",
        "milliarde",
        "milliarden",
        "billion",
        "billionen",
        "billiarde",
        "billiarden",
        "trillion",
        "trillionen",
        "trilliarde",
        "trilliarden",
    )
    suffix = pynini.union(*quantity_words)

    res = (
        pynutil.insert('integer_part: "')
        + numbers
        + pynutil.insert('"')
        + delete_extra_space
        + pynutil.insert('quantity: "')
        + suffix
        + pynutil.insert('"')
    )
    res |= decimal + delete_extra_space + pynutil.insert('quantity: "') + suffix + pynutil.insert('"')
    return res
def create_data_with_duplicate(filename, pos_dict, neg_dict, min_len, max_len, num, get_difference):
    """Write randomly sampled positive and negative strings to ``filename``.

    For every length from ``min_len`` to ``max_len`` (inclusive), ``num`` paths
    are sampled from ``pos_dict[length]`` and written as "<string>\\tTRUE",
    then ``num`` paths from ``neg_dict[length]`` written as "<string>\\tFALSE".

    Args:
        filename: output path (opened with mode "w+").
        pos_dict: mapping length -> FST of positive strings.
        neg_dict: mapping length -> FST of negative strings.
        min_len: smallest length to sample.
        max_len: largest length to sample.
        num: number of paths requested per sample.
        get_difference: when equal to 1, the sampled strings are removed from
            the corresponding dictionary entry.

    Returns:
        ``(pos_dict, neg_dict)``, possibly updated in place.
    """
    labelled_sources = ((pos_dict, "TRUE\n"), (neg_dict, "FALSE\n"))

    with open(filename, "w+") as out:
        for length in range(min_len, max_len + 1):
            for fsa_by_length, label in labelled_sources:
                sampled = pynini.randgen(
                    fsa_by_length[length],
                    npath=num,
                    seed=0,
                    select="uniform",
                    max_length=2147483647,
                    weighted=False,
                )
                if get_difference == 1:
                    fsa_by_length[length] = pynini.difference(fsa_by_length[length], sampled)
                for ele in list_string_set(sampled):
                    out.write(ele + "\t" + label)

    return pos_dict, neg_dict
def __init__(self, deterministic: bool = True):
    """Build the plain-word classification grammar.

    A word is one or more non-space characters, none of which is a digit or
    one of the listed currency/number symbols. Bracketed phoneme sequences
    take priority over that fallback.

    Args:
        deterministic: when False, a single optional space is tolerated right
            after "[" and right before "]" in phoneme sequences.
    """
    super().__init__(name="word", kind="classify", deterministic=deterministic)

    symbols_to_exclude = (pynini.union("$", "€", "₩", "£", "¥", "#", "%") | NEMO_DIGIT).optimize()
    plain_word = pynini.closure(pynini.difference(NEMO_NOT_SPACE, symbols_to_exclude), 1)

    # leave phones of format [HH AH0 L OW1] untouched
    phoneme_unit = pynini.closure(NEMO_ALPHA, 1) + pynini.closure(NEMO_DIGIT)
    left_bracket = pynini.accep(pynini.escape("["))
    right_bracket = pynini.accep(pynini.escape("]"))
    units = pynini.closure(phoneme_unit + pynini.accep(" ")) + phoneme_unit

    if deterministic:
        phoneme = left_bracket + units + right_bracket
    else:
        optional_space = pynini.closure(pynini.accep(" "), 0, 1)
        phoneme = left_bracket + optional_space + units + optional_space + right_bracket

    self.graph = plurals._priority_union(convert_space(phoneme), plain_word, NEMO_SIGMA)
    named = pynutil.insert('name: "') + self.graph + pynutil.insert('"')
    self.fst = named.optimize()
def __init__(self, number_names: dict, alternative_formats: dict, deterministic=False):
    """Build the ordinal classification grammar.

    Args:
        number_names: mapping that provides 'ordinal_number_names'.
        alternative_formats: mapping that provides 'one_thousand_alternative'
            and 'separators'.
        deterministic: forwarded to the base class.
    """
    super().__init__(name="ordinal", kind="classify", deterministic=deterministic)

    thousand_alternative = alternative_formats['one_thousand_alternative']
    separators = alternative_formats['separators']

    ordinal = number_names['ordinal_number_names']
    ordinal |= ordinal @ thousand_alternative
    ordinal_numbers = separators @ ordinal

    # to handle cases like 2-ая
    endings = pynini.string_file(get_abs_path("data/numbers/ordinal_endings.tsv"))
    not_dash = pynini.closure(pynini.difference(NEMO_SIGMA, "-"))
    # Drops the "-<ending>" marker at the end of the string.
    del_ending = pynini.cdrewrite(pynini.cross("-" + not_dash, ""), "", "[EOS]", NEMO_SIGMA)

    number_with_marker = ((separators @ ordinal).optimize() + pynini.accep("-") + not_dash).optimize()
    ends_with_known_ending = (NEMO_SIGMA + endings).optimize()
    ordinal_numbers_marked = (number_with_marker @ ends_with_known_ending @ del_ending).optimize()

    self.ordinal_numbers = ordinal_numbers

    # "03" -> remove leading zeros and verbalize
    strip_zeros = pynini.closure(pynini.cross("0", ""))
    self.ordinal_numbers_with_leading_zeros = (strip_zeros + ordinal_numbers).optimize()

    either_form = (ordinal_numbers | ordinal_numbers_marked).optimize()
    tagged = pynutil.insert('integer: "') + either_form + pynutil.insert('"')
    self.fst = self.add_tokens(tagged).optimize()
def __init__(self, whitelist: 'pynini.FstLike', deterministic: bool = True):
    """Build the abbreviation classification grammar.

    Accepts runs of upper-case letters, with or without dots, and offers both
    the unchanged form and a letter-by-letter spaced form. Inputs that the
    whitelist already covers are excluded.

    Args:
        whitelist: object exposing ``graph``; its input side is subtracted.
        deterministic: forwarded to the base class.
    """
    super().__init__(name="abbreviation", kind="classify", deterministic=deterministic)

    dot = pynini.accep(".")
    variants = (
        # A.B.C. -> A. B. C.
        NEMO_UPPER + dot + pynini.closure(insert_space + NEMO_UPPER + dot, 1),
        # A.B.C. -> A.B.C.
        NEMO_UPPER + dot + pynini.closure(NEMO_UPPER + dot, 1),
        # ABC -> ABC
        NEMO_UPPER + pynini.closure(NEMO_UPPER, 1),
        # ABC -> A B C
        NEMO_UPPER + pynini.closure(insert_space + NEMO_UPPER, 1),
    )
    graph = pynini.union(*variants)

    # exclude words that are included in the whitelist
    non_whitelisted_inputs = pynini.difference(
        pynini.project(graph, "input"), pynini.project(whitelist.graph, "input")
    )
    graph = pynini.compose(non_whitelisted_inputs, graph)

    tagged = pynutil.insert('value: "') + graph.optimize() + pynutil.insert('"')
    self.fst = self.add_tokens(tagged).optimize()
def __init__(self, whitelist: 'pynini.FstLike', deterministic: bool = True):
    """Build the weighted abbreviation classification grammar.

    The main pattern spaces out runs of upper-case letters. Several
    lower-priority "misc" patterns cover case-converted and dotted forms.
    Inputs that the whitelist already covers are excluded.

    Args:
        whitelist: object exposing ``graph``; its input side is subtracted.
        deterministic: forwarded to the base class.
    """
    super().__init__(name="abbreviation", kind="classify", deterministic=deterministic)

    main_graph = NEMO_UPPER + pynini.closure(insert_space + NEMO_UPPER, 1)

    lowered_letters = pynutil.add_weight(
        TO_LOWER + pynini.closure(insert_space + pynini.union(TO_LOWER | NEMO_LOWER)), 110
    )
    upper_then_lower = pynutil.add_weight(
        pynini.closure(NEMO_UPPER, 2) + pynini.closure(insert_space + NEMO_LOWER, 1), 110
    )
    dotted_upper = (
        NEMO_UPPER
        + pynutil.delete(".")
        + pynini.closure(insert_space + NEMO_UPPER + pynutil.delete("."))
    )
    dotted_lowered = pynutil.add_weight(
        TO_LOWER + pynutil.delete(".") + pynini.closure(insert_space + TO_LOWER + pynutil.delete(".")),
        110,
    )

    misc_graph = lowered_letters
    misc_graph |= upper_then_lower
    misc_graph |= dotted_upper
    misc_graph |= dotted_lowered

    # set weight of the misc graph to the value higher then word
    graph = pynutil.add_weight(main_graph.optimize(), 10) | pynutil.add_weight(misc_graph.optimize(), 101)

    # exclude words that are included in the whitelist
    non_whitelisted_inputs = pynini.difference(
        pynini.project(graph, "input"), pynini.project(whitelist.graph, "input")
    )
    graph = pynini.compose(non_whitelisted_inputs, graph)

    tagged = pynutil.insert('value: "') + graph.optimize() + pynutil.insert('"')
    self.fst = self.add_tokens(tagged).optimize()
def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
    """Build the whitelist classification grammar.

    Args:
        input_case: accepted for interface compatibility; the written form is
            lower-cased for every value.
        deterministic: forwarded to the base class.
        input_file: optional path to a whitelist file. When given it replaces
            the bundled "data/whitelist.tsv"; it is not merged with it.
    """
    super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

    def _get_whitelist_graph(input_case, file):
        # Both case modes lower-cased the first column in the original
        # if/else, so the duplicated branches are folded into one expression.
        whitelist = [[x[0].lower()] + x[1:] for x in load_labels(file)]
        return pynini.string_map(whitelist)

    # Pick the file first so the bundled whitelist is not loaded and compiled
    # only to be discarded when a custom file is supplied.
    whitelist_path = input_file if input_file else get_abs_path("data/whitelist.tsv")
    graph = _get_whitelist_graph(input_case, whitelist_path)

    units_graph = _get_whitelist_graph(input_case, file=get_abs_path("data/measurements.tsv"))
    # Keep a unit only if its written form has two or more characters, or is a
    # single character outside RU_ALPHA: single letters such as `м` are not
    # replaced, while `°` and `%` are.
    units_graph = pynini.compose((NEMO_CHAR ** (2, ...) | pynini.difference(NEMO_CHAR, RU_ALPHA)), units_graph)
    graph |= units_graph.optimize()
    graph |= TO_CYRILLIC + pynini.closure(pynutil.insert(" ") + TO_CYRILLIC)

    self.final_graph = convert_space(graph)
    self.fst = (pynutil.insert("name: \"") + self.final_graph + pynutil.insert("\"")).optimize()
def alternate_rand_gen_no_duplicate(acceptor, n):
    """Collect ``n`` distinct random strings from ``acceptor`` over up to 10 rounds.

    Each round samples from the current acceptor, appends the strings not
    collected so far, and returns as soon as exactly ``n`` have been gathered.
    Otherwise the round's sample is subtracted from the acceptor and the seed
    is advanced.

    Args:
        acceptor: FST to sample from.
        n: number of distinct strings wanted.

    Returns:
        ``(acceptor, rand_list)``. ``rand_list`` keeps first-seen order and may
        hold fewer than ``n`` strings if all rounds are exhausted.
    """
    rand_list = []
    # Mirror of rand_list for O(1) membership tests. The original scanned the
    # list for every candidate, and did so twice per round.
    seen = set()
    loop = 10
    seed = 0
    for i in range(loop):
        print('(alternate) trying to generate random strings (' + str(i) + ')')
        num = int(n + n * i * .01)
        temp = pynini.randgen(
            acceptor, npath=num, seed=seed, select='uniform', max_length=2147483647, weighted=False
        )
        print('made new `temp`')
        temp_list = list_string_set(temp)
        print('temp got ' + str(len(temp_list)) + ' random strings')
        temp_list = list(set(temp_list))
        new_count = sum(1 for t in temp_list if t not in seen)
        print('got ' + str(new_count) + ' new strings')
        for t in temp_list:
            if t not in seen:
                seen.add(t)
                rand_list.append(t)
            if len(rand_list) == n:
                print('rand_list now has ' + str(len(rand_list)) + ' strings')
                print('finally got enough strings in rand_list; i=' + str(i))
                # NOTE(review): on this early return the current sample has not
                # been subtracted from `acceptor` - confirm this is intended.
                return acceptor, rand_list
        acceptor = pynini.difference(acceptor, temp)
        seed += 1
        print('rand_list now has ' + str(len(rand_list)) + ' strings')
        print('need to add strings to rand_list (' + str(i) + ')')
    print('finished loop; returning incomplete set')
    return acceptor, rand_list
def __init__(self, deterministic: bool = True):
    """Build the plain-word classification grammar.

    A word is one or more non-space characters that are not accepted as input
    by the punctuation grammar.

    Args:
        deterministic: when False, the graph is rebuilt as one or more
            repetitions of a word that is not a currency sign followed by
            digits.
    """
    super().__init__(name="word", kind="classify", deterministic=deterministic)

    punct = PunctuationFst().graph
    non_punct_char = pynini.difference(NEMO_NOT_SPACE, punct.project("input"))
    self.graph = pynini.closure(non_punct_char, 1)

    if not deterministic:
        currency_amount = pynini.union("$", "€", "₩", "£", "¥") + pynini.closure(NEMO_DIGIT, 1)
        self.graph = pynini.closure(pynini.difference(self.graph, currency_amount), 1)

    named = pynutil.insert('name: "') + self.graph + pynutil.insert('"')
    self.fst = named.optimize()
def getNegString(fsa, min_len, max_len):
    """Enumerate, per length, the strings over ``sigma`` that ``fsa`` rejects.

    Args:
        fsa: acceptor whose complement is enumerated.
        min_len: smallest string length (inclusive).
        max_len: largest string length (inclusive).

    Returns:
        dict mapping each length to a randomly permuted list of the strings of
        that length not accepted by ``fsa``. Each list is also printed.
    """
    neg_str_dict = {}
    for length in range(min_len, max_len + 1):
        all_of_length = pynini.closure(sigma, length, length)
        rejected = pynini.difference(all_of_length, fsa)
        shuffled = list(np.random.permutation(listStringSet(rejected)))
        neg_str_dict[length] = shuffled
        print(shuffled)
    return neg_str_dict
def __init__(self, deterministic: bool = True):
    """Build the cardinal classification grammar.

    Loads the cardinal number-name FST from a FAR file and exposes:
      * ``graph_hundred_component_at_least_one_none_zero_digit``: 2-3 digit
        inputs, or a single non-zero digit, composed with the number names;
      * ``graph``: 1-3 digits followed by any number of 3-digit groups, each
        optionally preceded by a "," that is deleted;
      * ``single_digits_graph``: digit-by-digit reading, where "0" may also be
        read as "oh";
      * ``range_graph`` (non-deterministic mode only): "a-b" and "a x b"
        readings.

    Args:
        deterministic: when False, extra readings are added to ``graph`` and
            ranges are accepted by the final grammar.
    """
    super().__init__(name="cardinal", kind="classify", deterministic=deterministic)
    graph = pynini.Far(
        get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()
    self.graph_hundred_component_at_least_one_none_zero_digit = (
        pynini.closure(NEMO_DIGIT, 2, 3)
        | pynini.difference(NEMO_DIGIT, pynini.accep("0"))) @ graph
    self.graph = (pynini.closure(NEMO_DIGIT, 1, 3) + pynini.closure(
        pynini.closure(pynutil.delete(","), 0, 1) + NEMO_DIGIT + NEMO_DIGIT +
        NEMO_DIGIT)) @ graph

    graph_digit = pynini.string_file(
        get_abs_path("data/numbers/digit.tsv"))
    graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))

    # Digit-by-digit reading; the "oh" reading of "0" gets weight 1.1 and the
    # inverted digit/zero tables get weight 1.2.
    single_digits_graph = pynutil.add_weight(
        pynini.invert(graph_digit | graph_zero), 1.2) | pynutil.add_weight(
            pynini.cross("0", "oh"), 1.1)
    self.single_digits_graph = single_digits_graph + pynini.closure(
        pynutil.insert(" ") + single_digits_graph)

    # NOTE(review): the source arrived with its indentation collapsed; the
    # extent of this block (through range_graph) is reconstructed - confirm.
    if not deterministic:
        # Digit-by-digit reading of comma-grouped numbers; at least one
        # ",ddd" group is required and the commas are deleted.
        single_digits_graph_with_commas = pynini.closure(
            self.single_digits_graph + pynutil.insert(" "), 1,
            3) + pynini.closure(
                pynutil.delete(",") + single_digits_graph +
                pynutil.insert(" ") + single_digits_graph +
                pynutil.insert(" ") + single_digits_graph,
                1,
            )
        self.graph |= self.single_digits_graph | get_hundreds_graph(
        ) | single_digits_graph_with_commas

        # "a-b" -> "[from ]a to b" or "a b"
        self.range_graph = (
            pynini.closure(pynutil.insert("from "), 0, 1) + self.graph +
            (pynini.cross("-", " to ") | pynini.cross("-", " ")) +
            self.graph)

        # "axb" / "a x b" -> "a by b"
        self.range_graph |= self.graph + (pynini.cross(
            "x", " by ") | pynini.cross(" x ", " by ")) + self.graph
        self.range_graph = self.range_graph.optimize()

    # A leading "-" becomes the tag `negative: "true" `.
    optional_minus_graph = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)
    final_graph = self.graph | pynutil.add_weight(self.get_serial_graph(),
                                                  1.2)

    if not deterministic:
        final_graph |= self.range_graph

    final_graph = optional_minus_graph + pynutil.insert(
        "integer: \"") + final_graph + pynutil.insert("\"")
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def __init__(self, decimal: GraphFst, cardinal: GraphFst, fraction: GraphFst, deterministic: bool = True):
    """Build the measure verbalization grammar.

    Accepts a number (cardinal, decimal or fraction) followed by a unit, or a
    unit followed by a cardinal/decimal, plus the special "address" and "math"
    units which verbalize only the cardinal.

    Args:
        decimal: decimal verbalizer exposing ``numbers``.
        cardinal: cardinal verbalizer exposing ``numbers`` and ``optional_sign``.
        fraction: fraction verbalizer exposing ``graph``.
        deterministic: forwarded to the base class.
    """
    super().__init__(name="measure", kind="verbalize", deterministic=deterministic)
    optional_sign = cardinal.optional_sign

    # Any non-empty, space-free unit except the reserved "address"/"math".
    regular_unit_name = pynini.difference(
        pynini.closure(NEMO_CHAR - " ", 1), pynini.union("address", "math")
    )
    unit = pynutil.delete("units: \"") + regular_unit_name + pynutil.delete("\"") + delete_space

    def _signed_block(opening, numbers):
        # "<opening> [sign] <numbers> }" with the wrapper and spaces removed.
        return (
            pynutil.delete(opening)
            + delete_space
            + optional_sign
            + delete_space
            + numbers
            + delete_space
            + pynutil.delete("}")
        )

    graph_decimal = _signed_block("decimal {", decimal.numbers)
    graph_cardinal = _signed_block("cardinal {", cardinal.numbers)
    graph_fraction = (
        pynutil.delete("fraction {") + delete_space + fraction.graph + delete_space + pynutil.delete("}")
    )

    # number first, then unit
    graph = (graph_cardinal | graph_decimal | graph_fraction) + delete_space + insert_space + unit

    # SH adds "preserve_order: true" by default
    preserve_order = pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space
    # unit first, then number
    graph |= unit + insert_space + (graph_cardinal | graph_decimal) + delete_space + pynini.closure(preserve_order)

    def _cardinal_only(unit_tag):
        return (
            pynutil.delete(unit_tag)
            + delete_space
            + graph_cardinal
            + delete_space
            + pynini.closure(preserve_order)
        )

    address = _cardinal_only("units: \"address\" ")
    math_graph = _cardinal_only("units: \"math\" ")
    graph |= address | math_graph

    self.fst = self.delete_tokens(graph).optimize()
def border(fsa, n):
    """Return an FST from length-``n`` strings of ``fsa`` to strings outside it.

    The mapping composes ``fsa`` with ``editExactly1`` and with the complement
    of ``fsa`` (taken against ``sigmaStar``), then restricts the input side to
    strings of exactly ``n`` symbols of ``sigma``.

    Args:
        fsa: acceptor for the language.
        n: required input length.

    Returns:
        The optimized transducer.
    """
    outside = pynini.difference(sigmaStar, fsa).optimize()
    # this is the key insight which gives entire border
    border_pairs = (fsa @ editExactly1 @ outside).optimize()
    exactly_n = pynini.closure(sigma, n, n).optimize()
    # here we limit the border to input words of length=n
    return (exactly_n @ border_pairs).optimize()
def make_byte_star_except_boundary(
    boundary: pynini.FstLike = "+") -> pynini.Fst:
    """Build sigma-star over bytes with the boundary symbol removed.

    Args:
      boundary: a string, the boundary symbol to use.

    Returns:
      An optimized acceptor for any sequence of bytes other than the boundary
      symbol.
    """
    non_boundary_byte = pynini.difference(byte.BYTE, boundary)
    byte_star = non_boundary_byte.closure()
    return byte_star.optimize()
def get_quantity(deci):
    """Build an FST tagging a cardinal or decimal followed by a quantity word.

    Args:
        deci: decimal FST used for the "<decimal> <quantity>" branch.

    Returns:
        FST accepting "<cardinal> <quantity>" or "<decimal> <quantity>"; the
        decimal branch additionally accepts "thousand".
    """
    # Strip leading zeros; at least one non-zero leading digit must remain.
    without_leading_zeros = (
        pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT)
    )
    numbers = cardinal.graph_hundred_component_at_least_one_none_zero_digit @ without_leading_zeros
    suffix = pynini.union("million", "billion", "trillion", "quadrillion", "quintillion", "sextillion")

    open_quantity = pynutil.insert('quantity: "')
    close_quote = pynutil.insert('"')

    res = (
        pynutil.insert('integer_part: "')
        + numbers
        + pynutil.insert('"')
        + delete_extra_space
        + open_quantity
        + suffix
        + close_quote
    )
    res |= deci + delete_extra_space + open_quantity + (suffix | "thousand") + close_quote
    return res
def border(fsa, n):
    '''
    Take an fsa and produce an fst that converts the strings of length n in
    the language to "border" strings, which are 1 edit off from being in the
    language.
    '''
    # Everything over sigmaStar that the language does not accept.
    complement = pynini.difference(sigmaStar, fsa)
    complement.optimize()

    # this is the key insight which gives entire border
    pairs = fsa @ editExactly1 @ complement
    pairs.optimize()

    # here we limit the border to input words of length=n
    length_n = pynini.closure(sigma, n, n)
    length_n.optimize()
    pairs_of_length_n = length_n @ pairs
    pairs_of_length_n.optimize()

    return pairs_of_length_n
def __init__(self, deterministic: bool = True):
    """Build the plain-word classification grammar.

    A word is one or more non-space characters that are not accepted as input
    by the punctuation grammar. Bracketed phoneme sequences take priority over
    that fallback.

    Args:
        deterministic: when False, the fallback is rebuilt as one or more
            repetitions of a word that is not one of the listed symbols
            followed by digits.
    """
    super().__init__(name="word", kind="classify", deterministic=deterministic)

    punct = PunctuationFst().graph
    non_punct_char = pynini.difference(NEMO_NOT_SPACE, punct.project("input"))
    self.graph = pynini.closure(non_punct_char, 1)

    if not deterministic:
        symbol_then_digits = (
            pynini.union("$", "€", "₩", "£", "¥", "#", "$", "%") + pynini.closure(NEMO_DIGIT, 1)
        )
        self.graph = pynini.closure(pynini.difference(self.graph, symbol_then_digits), 1)

    # leave phones of format [HH AH0 L OW1] untouched
    phoneme_unit = pynini.closure(NEMO_ALPHA, 1) + pynini.closure(NEMO_DIGIT)
    phoneme = (
        pynini.accep(pynini.escape("["))
        + pynini.closure(phoneme_unit + pynini.accep(" "))
        + phoneme_unit
        + pynini.accep(pynini.escape("]"))
    )

    self.graph = plurals._priority_union(convert_space(phoneme), self.graph, NEMO_SIGMA)
    named = pynutil.insert('name: "') + self.graph + pynutil.insert('"')
    self.fst = named.optimize()
def rand_gen_no_duplicate(acceptor, n):
    """Draw up to ``n`` distinct random strings from ``acceptor``.

    Tries up to 10 times, enlarging the sample by 10% of ``n`` per attempt,
    until a single sample holds at least ``n`` distinct strings. The last
    attempt is accepted regardless.

    Args:
        acceptor: FST to sample from.
        n: number of distinct strings wanted.

    Returns:
        ``(acceptor, rand_list)``: the acceptor minus the accepted sample, and
        a sorted list of at most ``n`` strings.
    """
    max_attempts = 10
    for attempt in range(max_attempts):
        sample_size = int(n + n * attempt * 0.1)
        sample = pynini.randgen(
            acceptor,
            npath=sample_size,
            seed=0,
            select="uniform",
            max_length=2147483647,
            weighted=False,
        )
        distinct = list(set(list_string_set(sample)))

        if len(distinct) < n and attempt < max_attempts - 1:
            print('insufficient random strings')
            continue

        random.shuffle(distinct)
        chosen = sorted(distinct[:n])
        remaining = pynini.difference(acceptor, sample)
        return remaining, chosen
def __init__(self):
    """Build the cardinal classification grammar from the number-name FAR.

    Exposes ``graph_hundred_component_at_least_one_none_zero_digit`` (2-3
    digit inputs, or a single non-zero digit) and ``graph`` (1-3 digits
    followed by any number of 3-digit groups, each optionally preceded by a
    "," that is deleted).
    """
    super().__init__(name="cardinal", kind="classify")

    number_names = pynini.Far(get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()

    nonzero_digit = pynini.difference(NEMO_DIGIT, pynini.accep("0"))
    up_to_three_digits = pynini.closure(NEMO_DIGIT, 2, 3) | nonzero_digit
    self.graph_hundred_component_at_least_one_none_zero_digit = up_to_three_digits @ number_names

    optional_comma = pynini.closure(pynutil.delete(","), 0, 1)
    three_digit_group = optional_comma + NEMO_DIGIT + NEMO_DIGIT + NEMO_DIGIT
    written_number = pynini.closure(NEMO_DIGIT, 1, 3) + pynini.closure(three_digit_group)
    self.graph = written_number @ number_names

    # A leading "-" becomes the tag `negative: "true" `.
    optional_minus = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", '"true" '), 0, 1
    )
    tagged = optional_minus + pynutil.insert('integer: "') + self.graph + pynutil.insert('"')
    self.fst = self.add_tokens(tagged).optimize()
def get_quantity(
    decimal: 'pynini.FstLike', cardinal_up_to_thousand: 'pynini.FstLike') -> 'pynini.FstLike':
    """
    Build an FST tagging a cardinal or decimal followed by a quantity word,
    e.g. one million -> integer_part: "1" quantity: "million"
    e.g. one point five million -> integer_part: "1" fractional_part: "5" quantity: "million"

    Will tag cases up to denominations of tens of hundreds of thousand. 'douze cent mille millions' -> 1 200 000 millions

    Args:
        decimal: decimal FST
        cardinal_up_to_thousand: cardinal FST

    Returns:
        FST accepting "<cardinal> <quantity>", "<cardinal>-<quantity>" or
        "<decimal> <quantity>".
    """
    # Strip leading zeros; at least one non-zero leading digit must remain.
    without_leading_zeros = (
        pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT)
    )
    numbers = cardinal_up_to_thousand @ without_leading_zeros

    quantity_words = (
        "million",
        "millions",
        "milliard",
        "milliards",
        "billion",
        "billions",
        "billiard",
        "billiards",
        "trillion",
        "trillions",
        "trilliard",
        "trilliards",
    )
    suffix = pynini.union(*quantity_words)

    # Can be written either as 'deux-millions' or 'deux millions' depending on
    # whether it registers as a noun or part of cardinal.
    separator = pynini.union(delete_hyphen, delete_extra_space)

    res = (
        pynutil.insert('integer_part: "')
        + numbers
        + pynutil.insert('"')
        + separator
        + pynutil.insert(' quantity: "')
        + suffix
        + pynutil.insert('"')
    )
    res |= decimal + delete_extra_space + pynutil.insert(' quantity: "') + suffix + pynutil.insert('"')
    return res
def get_quantity(decimal: "pynini.FstLike", cardinal_up_to_hundred: "pynini.FstLike") -> "pynini.FstLike":
    """
    Build an FST tagging a cardinal or decimal followed by a quantity word,
    e.g. một triệu -> integer_part: "1" quantity: "triệu"
    e.g. một tỷ rưỡi -> integer_part: "1" fractional_part: "5" quantity: "tỷ"

    Args:
        decimal: decimal FST
        cardinal_up_to_hundred: cardinal FST

    Returns:
        FST for "<cardinal> <quantity> [<fraction digit>]" or
        "<decimal> <quantity>"; the decimal branch also accepts
        "ngàn" / "nghìn" as the quantity.
    """
    # Strip leading zeros; at least one non-zero leading digit must remain.
    without_leading_zeros = (
        pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT)
    )
    numbers = cardinal_up_to_hundred @ without_leading_zeros
    suffix = pynini.union("triệu", "tỉ", "tỷ", "vạn")

    # Special spoken forms allowed for the trailing fractional digit.
    graph_four = pynini.cross("tư", "4")
    graph_one = pynini.cross("mốt", "1")
    graph_half = pynini.cross("rưỡi", "5")

    # Regular digit words, with "năm" removed from the accepted inputs.
    last_digit_exception = pynini.project(pynini.cross("năm", "5"), "input")
    regular_digit_inputs = pynini.project(graph_digit, "input") - last_digit_exception.arcsort()
    last_digit = pynini.union(
        regular_digit_inputs @ graph_digit,
        graph_one,
        graph_four,
        graph_half,
    )

    trailing_digit = last_digit | graph_half | graph_one | graph_four
    optional_fraction_graph = pynini.closure(
        delete_extra_space + pynutil.insert('fractional_part: "') + trailing_digit + pynutil.insert('"'),
        0,
        1,
    )

    cardinal_quantity = (
        pynutil.insert('integer_part: "')
        + numbers
        + pynutil.insert('"')
        + delete_extra_space
        + pynutil.insert('quantity: "')
        + suffix
        + pynutil.insert('"')
        + optional_fraction_graph
    )
    decimal_quantity = (
        decimal
        + delete_extra_space
        + pynutil.insert('quantity: "')
        + (suffix | "ngàn" | "nghìn")
        + pynutil.insert('"')
    )

    res = cardinal_quantity
    res |= decimal_quantity
    return res
def __init__(self, ordinal: GraphFst, deterministic: bool = True):
    """Build the date verbalization grammar.

    Accepts "day month [year]", "month year" or a bare year. The day, and any
    month that is not one of the known month names, is rewritten with
    ``ordinal.ordinal_stem`` at the end of the string and suffixed with "ter".

    Args:
        ordinal: ordinal grammar exposing ``ordinal_stem``.
        deterministic: forwarded to the base class.
    """
    super().__init__(name="date", kind="verbalize", deterministic=deterministic)

    def _field(opening):
        # Strip `<opening>...` down to the non-empty quoted value.
        return pynutil.delete(opening) + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")

    def _stem_at_end():
        return pynini.cdrewrite(ordinal.ordinal_stem, "", "[EOS]", NEMO_SIGMA)

    day_value = _field("day: \"")
    day = (day_value @ _stem_at_end()) + pynutil.insert("ter")

    month_labels = load_labels(get_abs_path("data/months/abbr_to_name.tsv"))
    months_names = pynini.union(*[label[1] for label in month_labels])

    month = _field("month: \"")
    final_month = month @ months_names
    # Months that are not known names are treated like the day.
    not_a_month_name = pynini.difference(NEMO_SIGMA, months_names)
    final_month |= (month @ not_a_month_name @ _stem_at_end()) + pynutil.insert("ter")

    year = _field("year: \"")

    # day month year
    optional_year = pynini.closure(pynini.accep(" ") + year, 0, 1)
    graph_dmy = day + pynini.accep(" ") + final_month + optional_year
    graph_dmy |= final_month + pynini.accep(" ") + year

    self.graph = graph_dmy | year
    tokens_removed = self.delete_tokens(self.graph + delete_preserve_order)
    self.fst = tokens_removed.optimize()
def __init__(
    self,
    input_case: str,
    deterministic: bool = True,
    cache_dir: str = None,
    overwrite_cache: bool = True,
    whitelist: str = None,
):
    """Build, or restore from cache, the combined classify-and-verbalize grammar.

    Sets ``self.fst`` (the full grammar) and ``self.fst_no_digits`` (the same
    grammar composed with a filter whose strings contain no digit).

    Args:
        input_case: forwarded to ``WhiteListFst``; also part of the cache
            file name.
        deterministic: forwarded to the taggers and verbalizers; when False
            the roman and abbreviation grammars are added as well.
        cache_dir: directory for the ``.far`` cache file. ``None`` or the
            string ``'None'`` disables caching.
        overwrite_cache: when False and the cache file exists, the grammar is
            loaded from it instead of being rebuilt.
        whitelist: optional whitelist file path, forwarded to
            ``WhiteListFst`` as ``input_file``.
    """
    super().__init__(name="tokenize_and_classify",
                     kind="classify",
                     deterministic=deterministic)

    far_file = None
    if cache_dir is not None and cache_dir != 'None':
        os.makedirs(cache_dir, exist_ok=True)
        # The whitelist's base name is part of the cache file name.
        whitelist_file = os.path.basename(whitelist) if whitelist else ""
        far_file = os.path.join(
            cache_dir,
            f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
        )
    if not overwrite_cache and far_file and os.path.exists(far_file):
        self.fst = pynini.Far(far_file, mode='r')['tokenize_and_classify']
        no_digits = pynini.closure(pynini.difference(
            NEMO_CHAR, NEMO_DIGIT))
        self.fst_no_digits = pynini.compose(self.fst, no_digits).optimize()
        logging.info(f'ClassifyFst.fst was restored from {far_file}.')
    else:
        logging.info(
            f'Creating ClassifyFst grammars. This might take some time...')
        # TAGGERS
        cardinal = CardinalFst(deterministic=deterministic)
        cardinal_graph = cardinal.fst
        ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic)
        deterministic_ordinal = OrdinalFst(cardinal=cardinal,
                                           deterministic=True)
        ordinal_graph = ordinal.fst
        decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic)
        decimal_graph = decimal.fst
        fraction = FractionFst(deterministic=deterministic, cardinal=cardinal)
        fraction_graph = fraction.fst
        measure = MeasureFst(cardinal=cardinal,
                             decimal=decimal,
                             fraction=fraction,
                             deterministic=deterministic)
        measure_graph = measure.fst
        date_graph = DateFst(cardinal=cardinal,
                             deterministic=deterministic).fst
        word_graph = WordFst(deterministic=deterministic).graph
        time_graph = TimeFst(cardinal=cardinal,
                             deterministic=deterministic).fst
        telephone_graph = TelephoneFst(deterministic=deterministic).fst
        electronic_graph = ElectronicFst(deterministic=deterministic).fst
        money_graph = MoneyFst(cardinal=cardinal,
                               decimal=decimal,
                               deterministic=deterministic).fst
        # From here on `whitelist` is the WhiteListFst instance, not the path
        # passed in by the caller.
        whitelist = WhiteListFst(input_case=input_case,
                                 deterministic=deterministic,
                                 input_file=whitelist)
        whitelist_graph = whitelist.graph
        punct_graph = PunctuationFst(deterministic=deterministic).graph
        serial_graph = SerialFst(cardinal=cardinal,
                                 ordinal=deterministic_ordinal,
                                 deterministic=deterministic).fst

        # VERBALIZERS
        # `cardinal`, `decimal`, `ordinal`, `fraction` and `measure` are
        # rebound to verbalizer instances below; the tagger graphs were
        # captured above.
        cardinal = vCardinal(deterministic=deterministic)
        v_cardinal_graph = cardinal.fst
        decimal = vDecimal(cardinal=cardinal, deterministic=deterministic)
        v_decimal_graph = decimal.fst
        ordinal = vOrdinal(deterministic=deterministic)
        v_ordinal_graph = ordinal.fst
        fraction = vFraction(deterministic=deterministic)
        v_fraction_graph = fraction.fst
        v_telephone_graph = vTelephone(deterministic=deterministic).fst
        v_electronic_graph = vElectronic(deterministic=deterministic).fst
        measure = vMeasure(decimal=decimal,
                           cardinal=cardinal,
                           fraction=fraction,
                           deterministic=deterministic)
        v_measure_graph = measure.fst
        v_time_graph = vTime(deterministic=deterministic).fst
        v_date_graph = vDate(ordinal=ordinal,
                             deterministic=deterministic).fst
        v_money_graph = vMoney(decimal=decimal,
                               deterministic=deterministic).fst
        v_roman_graph = vRoman(deterministic=deterministic).fst
        v_abbreviation = vAbbreviation(deterministic=deterministic).fst

        # The range grammar is built from deterministic time/date
        # verbalizations and a deterministic cardinal tagger.
        det_v_time_graph = vTime(deterministic=True).fst
        det_v_date_graph = vDate(ordinal=vOrdinal(deterministic=True),
                                 deterministic=True).fst
        time_final = pynini.compose(time_graph, det_v_time_graph)
        date_final = pynini.compose(date_graph, det_v_date_graph)
        range_graph = RangeFst(time=time_final,
                               date=date_final,
                               cardinal=CardinalFst(deterministic=True),
                               deterministic=deterministic).fst
        v_word_graph = vWord(deterministic=deterministic).fst

        # Weights: semiotic classes get sem_w, the plain-word fallback gets
        # the much larger word_w, punctuation gets punct_w.
        sem_w = 1
        word_w = 100
        punct_w = 2
        classify_and_verbalize = (
            pynutil.add_weight(whitelist_graph, sem_w)
            | pynutil.add_weight(pynini.compose(time_graph, v_time_graph),
                                 sem_w)
            | pynutil.add_weight(
                pynini.compose(decimal_graph, v_decimal_graph), sem_w)
            | pynutil.add_weight(
                pynini.compose(measure_graph, v_measure_graph), sem_w)
            | pynutil.add_weight(
                pynini.compose(cardinal_graph, v_cardinal_graph), sem_w)
            | pynutil.add_weight(
                pynini.compose(ordinal_graph, v_ordinal_graph), sem_w)
            | pynutil.add_weight(
                pynini.compose(telephone_graph, v_telephone_graph), sem_w)
            | pynutil.add_weight(
                pynini.compose(electronic_graph, v_electronic_graph), sem_w)
            | pynutil.add_weight(
                pynini.compose(fraction_graph, v_fraction_graph), sem_w)
            | pynutil.add_weight(
                pynini.compose(money_graph, v_money_graph), sem_w)
            | pynutil.add_weight(word_graph, word_w)
            | pynutil.add_weight(pynini.compose(date_graph, v_date_graph),
                                 sem_w - 0.01)
            | pynutil.add_weight(pynini.compose(range_graph, v_word_graph),
                                 sem_w)
            | pynutil.add_weight(
                pynini.compose(serial_graph, v_word_graph),
                1.1001)  # should be higher than the rest of the classes
        ).optimize()

        if not deterministic:
            roman_graph = RomanFst(deterministic=deterministic).fst
            # the weight matches the word_graph weight for "I" cases in long sentences with multiple semiotic tokens
            classify_and_verbalize |= pynutil.add_weight(
                pynini.compose(roman_graph, v_roman_graph), word_w)

            abbreviation_graph = AbbreviationFst(
                whitelist=whitelist, deterministic=deterministic).fst
            classify_and_verbalize |= pynutil.add_weight(
                pynini.compose(abbreviation_graph, v_abbreviation), word_w)

        punct_only = pynutil.add_weight(punct_graph, weight=punct_w)
        # One or more items, each either a whitespace run passed through
        # delete_extra_space or a punctuation mark preceded by an inserted
        # space.
        punct = pynini.closure(
            pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1),
                           delete_extra_space)
            | (pynutil.insert(" ") + punct_only),
            1,
        )
        token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) +
                            classify_and_verbalize +
                            pynini.closure(pynutil.insert(" ") + punct))

        # A sentence: tokens separated by whitespace or by punctuation.
        graph = token_plus_punct + pynini.closure(
            (pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1),
                            delete_extra_space)
             | (pynutil.insert(" ") + punct + pynutil.insert(" "))) +
            token_plus_punct)
        # Also accept input made of punctuation only.
        graph |= punct_only + pynini.closure(punct)

        graph = delete_space + graph + delete_space

        # Output filter: runs of non-space characters joined through
        # delete_extra_space, optionally after deleted leading spaces.
        remove_extra_spaces = pynini.closure(
            NEMO_NOT_SPACE, 1) + pynini.closure(
                delete_extra_space + pynini.closure(NEMO_NOT_SPACE, 1))
        remove_extra_spaces |= (
            pynini.closure(pynutil.delete(" "), 1) +
            pynini.closure(NEMO_NOT_SPACE, 1) + pynini.closure(
                delete_extra_space + pynini.closure(NEMO_NOT_SPACE, 1)))

        graph = pynini.compose(graph.optimize(),
                               remove_extra_spaces).optimize()
        self.fst = graph
        no_digits = pynini.closure(pynini.difference(
            NEMO_CHAR, NEMO_DIGIT))
        self.fst_no_digits = pynini.compose(graph, no_digits).optimize()

        # Save the freshly built grammar when caching is enabled.
        if far_file:
            generator_main(far_file, {"tokenize_and_classify": self.fst})
            logging.info(f'ClassifyFst grammars are saved to {far_file}.')
def __init__(self):
    """Assemble the cardinal classification grammar.

    The grammar reads a spelled-out number and writes its digit string,
    wrapped as ``integer: "<digits>"`` with an optional leading
    ``negative: "-"`` field when the input starts with "minus".

    Attributes set:
        graph_hundred_component_at_least_one_none_zero_digit: three-digit
            group transducer restricted to outputs with a non-zero digit.
        graph_no_exception: full number transducer (after dropping " and ").
        graph: ``graph_no_exception`` with the inputs produced by
            ``num_to_word(0..12)`` removed.
        fst: the final, token-wrapped and optimized classifier.
    """
    super().__init__(name="cardinal", kind="classify")

    # Lookup tables loaded from the grammar data directory
    # (presumably number word -> digit(s); verify against the .tsv files).
    zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
    teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))

    # Hundreds position: "<digit> hundred", or an inserted "0" when absent.
    hundreds_place = pynini.union(
        digit + delete_space + pynini.cross("hundred", ""),
        pynutil.insert("0"),
    )
    # Tens and units positions: a teen (or "00"), or ties followed by a
    # digit, with "0" inserted for whichever of the two is missing.
    tens_and_units = pynini.union(
        teen | pynutil.insert("00"),
        (ties | pynutil.insert("0")) + delete_space + (digit | pynutil.insert("0")),
    )
    hundred_component = hundreds_place + delete_space + tens_and_units

    # Same three-digit group, but the output must contain a non-zero digit.
    nonzero_hundred_component = hundred_component @ (
        pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT)
    )
    self.graph_hundred_component_at_least_one_none_zero_digit = nonzero_hundred_component

    def scale_group(scale_word):
        # A non-zero three-digit group followed by its scale word, or a
        # weighted "000" placeholder when that scale is not spoken.
        return pynini.union(
            nonzero_hundred_component + delete_space + pynutil.delete(scale_word),
            pynutil.insert("000", weight=0.1),
        )

    # Scale groups are concatenated from the largest down to "thousand",
    # followed by the trailing three-digit group.
    scale_words = (
        "sextillion",
        "quintillion",
        "quadrillion",
        "trillion",
        "billion",
        "million",
        "thousand",
    )
    positional = scale_group(scale_words[0])
    for scale_word in scale_words[1:]:
        positional = positional + delete_space + scale_group(scale_word)
    positional = positional + delete_space + hundred_component

    number = pynini.union(positional, zero)

    # Drop the leading padding zeros; a lone "0" is kept as is.
    strip_leading_zeros = pynini.union(
        pynutil.delete(pynini.closure("0"))
        + pynini.difference(NEMO_DIGIT, "0")
        + pynini.closure(NEMO_DIGIT),
        "0",
    )
    number = number @ strip_leading_zeros

    # Inputs excluded from self.graph: whatever num_to_word yields for 0..12
    # (presumably the spelled-out words; confirm against num_to_word).
    excluded_inputs = pynini.union(*[num_to_word(value) for value in range(0, 13)])

    # Delete a space-delimited "and", and require the input to start with
    # a letter, before handing the text to the number grammar.
    drop_and = pynini.cdrewrite(pynutil.delete("and"), NEMO_SPACE, NEMO_SPACE, NEMO_SIGMA)
    number = drop_and @ (NEMO_ALPHA + NEMO_SIGMA) @ number

    self.graph_no_exception = number
    allowed_inputs = pynini.project(number, "input") - excluded_inputs.arcsort()
    self.graph = allowed_inputs @ number

    # Optional sign: "minus " becomes the field negative: "-" plus a space.
    optional_minus = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("minus", "\"-\"") + NEMO_SPACE, 0, 1
    )
    tagged = optional_minus + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")
    self.fst = self.add_tokens(tagged).optimize()
from pynini.examples import plurals from pynini.lib import byte, pynutil, utf8 NEMO_CHAR = utf8.VALID_UTF8_CHAR NEMO_DIGIT = byte.DIGIT NEMO_LOWER = pynini.union(*string.ascii_lowercase).optimize() NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize() NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize() NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize() NEMO_HEX = pynini.union(*string.hexdigits).optimize() NEMO_NON_BREAKING_SPACE = u"\u00A0" NEMO_SPACE = " " NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize() NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() NEMO_PUNCT = pynini.union( *map(pynini.escape, string.punctuation)).optimize() NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize() NEMO_SIGMA = pynini.closure(NEMO_CHAR) delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE)) insert_space = pynutil.insert(" ") delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ") suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv")) # _v = pynini.union("a", "e", "i", "o", "u") _c = pynini.union("b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n",