def __init__(self, deterministic: bool = True):
    """Build the "electronic" (e-mail address) graph with Russian verbalization.

    A tagger graph splits ``user@domain`` into ``username: "..."`` and
    ``domain: "..."`` fields; a verbalizer graph then spells those fields out.
    The two are composed into ``self.final_graph`` and wrapped with
    ``add_tokens`` to produce ``self.fst``.

    Args:
        deterministic: forwarded unchanged to the parent constructor.
    """
    super().__init__(name="electronic", kind="classify", deterministic=deterministic)

    # tagger
    accepted_symbols = []
    # Explicit UTF-8: the second column is later loaded as verbalization text,
    # so the platform default encoding must not be relied upon.
    with open(get_abs_path("data/electronic/symbols.tsv"), 'r', encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank / trailing lines instead of failing on unpacking.
                continue
            # Only the first column (the written symbol) is needed here.
            symbol = line.rstrip('\r\n').split('\t', 1)[0]
            accepted_symbols.append(pynini.accep(symbol))

    # The user name must start with a letter; "@" is rewritten to a space so
    # that the two tagged fields are space separated.
    username = (
        pynutil.insert("username: \"")
        + NEMO_ALPHA
        + pynini.closure(NEMO_ALPHA | NEMO_DIGIT | pynini.union(*accepted_symbols))
        + pynutil.insert("\"")
        + pynini.cross('@', ' ')
    )
    # Domain: a letter, then letters/digits/'-'/'.', ending in a letter or digit.
    domain_graph = (
        NEMO_ALPHA
        + (pynini.closure(NEMO_ALPHA | NEMO_DIGIT | pynini.accep('-') | pynini.accep('.')))
        + (NEMO_ALPHA | NEMO_DIGIT)
    )
    domain_graph = pynutil.insert("domain: \"") + domain_graph + pynutil.insert("\"")
    tagger_graph = (username + domain_graph).optimize()

    # verbalizer
    graph_digit = pynini.string_file(
        get_abs_path("data/numbers/digits_nominative_case.tsv")).optimize()
    graph_symbols = pynini.string_file(
        get_abs_path("data/electronic/symbols.tsv")).optimize()

    # Strip the `username: "` wrapper, then emit every character followed by a
    # space.  Digit and symbol verbalizations carry a slightly lower weight
    # (1.09) than the raw character pass-through (1.1), so they are preferred.
    user_name = (
        pynutil.delete("username:")
        + delete_space
        + pynutil.delete("\"")
        + (pynini.closure(
            pynutil.add_weight(graph_digit + insert_space, 1.09)
            | pynutil.add_weight(
                pynini.closure(graph_symbols + pynutil.insert(" ")), 1.09)
            | pynutil.add_weight(NEMO_NOT_QUOTE + insert_space, 1.1)))
        + pynutil.delete("\"")
    )

    # Fallback spelling of the part from the dot onwards, character by character.
    domain_default = (
        pynini.closure(NEMO_NOT_QUOTE + insert_space)
        + pynini.cross(".", "точка ")
        + NEMO_NOT_QUOTE
        + pynini.closure(insert_space + NEMO_NOT_QUOTE)
    )
    # Fallback spelling of the server name, character by character.
    server_default = (
        pynini.closure((graph_digit | NEMO_ALPHA) + insert_space, 1)
        + pynini.closure(graph_symbols + insert_space)
        + pynini.closure((graph_digit | NEMO_ALPHA) + insert_space, 1)
    )
    # Listed server / domain names are preferred (weight 1.09) over the
    # character-by-character fallbacks (weight 1.1).
    server_common = pynini.string_file(
        get_abs_path("data/electronic/server_name.tsv")) + insert_space
    domain_common = pynini.cross(".", "точка ") + pynini.string_file(
        get_abs_path("data/electronic/domain.tsv"))
    domain = (
        pynutil.delete("domain:")
        + delete_space
        + pynutil.delete("\"")
        + (pynutil.add_weight(server_common, 1.09) | pynutil.add_weight(server_default, 1.1))
        + (pynutil.add_weight(domain_common, 1.09) | pynutil.add_weight(domain_default, 1.1))
        + delete_space
        + pynutil.delete("\"")
    )

    graph = (
        user_name
        + delete_space
        + pynutil.insert("собака ")
        + delete_space
        + domain
        + delete_space
    )

    # replace all latin letters with their Ru verbalization
    verbalizer_graph = (
        graph.optimize()
        @ (pynini.closure(TO_LATIN | RU_ALPHA | pynini.accep(" ")))
    ).optimize()

    self.final_graph = (tagger_graph @ verbalizer_graph).optimize()
    self.fst = self.add_tokens(
        pynutil.insert("username: \"") + self.final_graph + pynutil.insert("\"")
    ).optimize()
def __construct_inflection(self):
    '''
    Build the inflection cross.

    Each branch inserts one inflection-class marker (empty input, marker on
    the output side) and continues with the transducer stored for that
    class; the optimized union of all branches is returned.
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        # (marker, per-class transducer) pairs, in the order they are unioned.
        marker_to_paradigm = (
            ("<Adj0>", self.__adj0),
            ("<Adj0-Up>", self.__adj0_up),
            ("<Adj+>", self.__adj_plus),
            ("<Adj+e>", self.__adj_plus_e),
            ("<NMasc_es_e>", self.__nmasc_es_e),
            ("<NMasc_es_$e>", self.__nmasc_es_e_ul),
            ("<NMasc_es_en>", self.__nmasc_es_en),
            ("<NFem-Deriv>", self.__nfem_deriv),
            ("<NFem_0_n>", self.__nfem_0_n),
            ("<NNeut-Dimin>", self.__nneut_dimin),
            ("<NNeut/Sg_s>", self.__nneut_sg_s),
            ("<VVReg>", self.__vv_reg),
        )
        branches = [
            pynini.concat(pynini.cross("", marker), paradigm)
            for marker, paradigm in marker_to_paradigm
        ]
        return pynini.union(*branches).optimize()
NEMO_NON_BREAKING_SPACE = u"\u00A0" NEMO_SPACE = " " NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize() NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() NEMO_PUNCT = pynini.union( *map(pynini.escape, string.punctuation)).optimize() NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize() NEMO_SIGMA = pynini.closure(NEMO_CHAR) delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE)) insert_space = pynutil.insert(" ") delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ") # French frequently compounds numbers with hyphen. delete_hyphen = pynutil.delete(pynini.closure("-", 0, 1)) insert_hyphen = pynutil.insert("-") TO_LOWER = pynini.union(*[ pynini.cross(x, y) for x, y in zip(string.ascii_uppercase, string.ascii_lowercase) ]) TO_UPPER = pynini.invert(TO_LOWER) PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): # Create placeholders NEMO_CHAR = None
def __init__(self):
    """Build the Spanish spoken-time classifier graph.

    Spoken hours/minutes are rewritten to digits and wrapped in
    ``hours: "..."``, ``minutes: "..."`` and ``suffix: "..."`` fields.
    The result, wrapped with ``add_tokens``, is stored in ``self.fst``.
    """
    super().__init__(name="time", kind="classify")

    suffix_graph = pynini.string_file(get_abs_path("data/time/time_suffix.tsv"))
    time_to_graph = pynini.string_file(get_abs_path("data/time/time_to.tsv"))

    graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
    graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
    graph_twenties = pynini.string_file(get_abs_path("data/numbers/twenties.tsv"))

    graph_1_to_100 = pynini.union(
        graph_digit,
        graph_twenties,
        graph_teen,
        (graph_ties + pynutil.insert("0")),
        (graph_ties + pynutil.delete(" y ") + graph_digit),
    )

    # note that graph_hour will start from 2 hours
    # "1 o'clock" will be treated differently because it is singular
    digits_2_to_23 = [str(digits) for digits in range(2, 24)]
    digits_1_to_59 = [str(digits) for digits in range(1, 60)]

    graph_1oclock = pynini.cross("la una", "la 1")
    # `@` binds tighter than `+`: the article is prepended to the number graph
    # after it has been restricted to the outputs 2..23.
    graph_hour = pynini.cross("las ", "las ") + graph_1_to_100 @ pynini.union(*digits_2_to_23)
    graph_minute = graph_1_to_100 @ pynini.union(*digits_1_to_59)
    graph_minute_verbose = pynini.cross("media", "30") | pynini.cross("cuarto", "15")

    final_graph_hour = (
        pynutil.insert("hours: \"")
        + (graph_1oclock | graph_hour)
        + pynutil.insert("\"")
    )
    # An optional connective ("y" / "con") may precede the minutes.
    final_graph_minute = (
        pynutil.insert("minutes: \"")
        + pynini.closure((pynutil.delete("y") | pynutil.delete("con")) + delete_space, 0, 1)
        + (graph_minute | graph_minute_verbose)
        + pynutil.insert("\"")
    )
    final_suffix = (
        pynutil.insert("suffix: \"")
        + convert_space(suffix_graph)
        + pynutil.insert("\"")
    )
    final_suffix_optional = pynini.closure(delete_space + insert_space + final_suffix, 0, 1)

    # las nueve a eme (only convert on-the-hour times if they are followed by a suffix)
    graph_hsuffix = (
        final_graph_hour
        + delete_extra_space
        + pynutil.insert("minutes: \"00\"")
        + insert_space
        + final_suffix
    )

    # las nueve y veinticinco
    graph_hm = final_graph_hour + delete_extra_space + final_graph_minute

    # un cuarto para las cinco
    graph_mh = (
        pynutil.insert("minutes: \"")
        + pynini.union(
            pynini.cross("un cuarto para", "45"),
            pynini.cross("cuarto para", "45"),
        )
        + pynutil.insert("\"")
        + delete_extra_space
        + pynutil.insert("hours: \"")
        + time_to_graph
        + pynutil.insert("\"")
    )

    # las diez menos diez
    # "menos N" means N minutes before the hour, so the minute field is 60 - N.
    graph_time_to = (
        pynutil.insert("hours: \"")
        + time_to_graph
        + pynutil.insert("\"")
        + delete_extra_space
        + pynutil.insert("minutes: \"")
        + delete_space
        + pynutil.delete("menos")
        + delete_space
        + pynini.union(
            pynini.cross("cinco", "55"),
            pynini.cross("diez", "50"),
            pynini.cross("cuarto", "45"),
            pynini.cross("veinte", "40"),
            # 60 - 25 = 35 (this entry previously produced "30").
            pynini.cross("veinticinco", "35"),
        )
        + pynutil.insert("\"")
    )

    final_graph = pynini.union(
        (graph_hm | graph_mh | graph_time_to) + final_suffix_optional,
        graph_hsuffix,
    ).optimize()

    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def __init__(self):
    """Assemble the spoken-money classifier graph.

    Produces ``integer_part``, ``fractional_part`` and ``currency`` fields
    from spoken amounts ("twelve dollars fifty cents") and stores the
    token-wrapped, optimized result in ``self.fst``.
    """
    super().__init__(name="money", kind="classify")
    # quantity, integer_part, fractional_part, currency, style(depr)

    number_words = CardinalFst().graph_no_exception
    decimal_amount = DecimalFst().final_graph_wo_negative

    currency_table = pynini.string_file(get_abs_path("data/currency.tsv"))
    singular_names = pynini.invert(currency_table)
    plural_names = get_singulars(singular_names)

    currency_singular = (
        pynutil.insert("currency: \"") + convert_space(singular_names) + pynutil.insert("\"")
    )
    currency_plural = (
        pynutil.insert("currency: \"") + convert_space(plural_names) + pynutil.insert("\"")
    )

    # Pass two digits through unchanged; pad a single digit with a leading zero.
    two_digit = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT)

    # Any spoken number except the bare word "one".
    any_but_one = (NEMO_SIGMA - "one") @ number_words

    # twelve dollars (and) fifty cents, zero cents
    plural_cents = (
        pynutil.add_weight(any_but_one, -0.7) @ two_digit
        + delete_space
        + pynutil.delete("cents")
    )
    single_cent = pynini.cross("one", "01") + delete_space + pynutil.delete("cent")
    cents_standalone = (
        pynutil.insert("fractional_part: \"")
        + pynini.union(plural_cents, single_cent)
        + pynutil.insert("\"")
    )

    optional_and = pynini.closure(pynutil.delete("and") + delete_space, 0, 1)
    maybe_cents_standalone = pynini.closure(
        delete_space + optional_and + insert_space + cents_standalone,
        0,
        1,
    )

    # twelve dollars fifty, only after integer
    maybe_cents_suffix = pynini.closure(
        delete_extra_space
        + pynutil.insert("fractional_part: \"")
        + pynutil.add_weight(number_words @ two_digit, -0.7)
        + pynutil.insert("\""),
        0,
        1,
    )

    trailing_cents = maybe_cents_standalone | maybe_cents_suffix

    # Plural currency for every amount but "one" ...
    amount_integer = (
        pynutil.insert("integer_part: \"")
        + any_but_one
        + pynutil.insert("\"")
        + delete_extra_space
        + currency_plural
        + trailing_cents
    )
    # ... and singular currency for exactly "one".
    amount_integer |= (
        pynutil.insert("integer_part: \"")
        + pynini.cross("one", "1")
        + pynutil.insert("\"")
        + delete_extra_space
        + currency_singular
        + trailing_cents
    )

    amount_decimal = decimal_amount + delete_extra_space + currency_plural
    # Cents on their own are tagged with a "$" currency and a zero integer part.
    amount_decimal |= pynutil.insert("currency: \"$\" integer_part: \"0\" ") + cents_standalone

    tagged = self.add_tokens(amount_integer | amount_decimal)
    self.fst = tagged.optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, deterministic: bool = True):
    """Assemble the measure classifier graph.

    Combines number graphs taken from ``cardinal``, ``decimal`` and
    ``fraction`` with the unit table, plus dash-joined, address and math
    patterns, and stores the token-wrapped, optimized union in ``self.fst``.

    Args:
        cardinal: provides ``graph`` (and ``range_graph`` when not deterministic).
        decimal: provides ``final_graph_wo_negative``.
        fraction: provides ``graph``.
        deterministic: forwarded to the parent constructor; when False the
            cardinal graph additionally accepts ranges.
    """
    super().__init__(name="measure", kind="classify", deterministic=deterministic)

    cardinal_graph = cardinal.graph
    if not deterministic:
        cardinal_graph |= cardinal.range_graph

    # --- units -----------------------------------------------------------
    unit_lookup = pynini.string_file(get_abs_path("data/measurements.tsv"))
    # Also accept inputs whose leading letters are upper case by lower-casing
    # them before the table lookup.
    lowercased_input = pynini.closure(TO_LOWER, 1) + pynini.closure(NEMO_ALPHA)
    unit_lookup |= pynini.compose(lowercased_input, unit_lookup)

    plural_unit = convert_space(unit_lookup @ SINGULAR_TO_PLURAL)
    singular_unit = convert_space(unit_lookup)

    minus_sign = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)

    # "/unit" is read as "per unit".
    per_unit = (
        pynini.cross("/", "per")
        + delete_space
        + pynutil.insert(NEMO_NON_BREAKING_SPACE)
        + singular_unit
    )
    maybe_per_unit = pynini.closure(
        delete_space + pynutil.insert(NEMO_NON_BREAKING_SPACE) + per_unit,
        0,
        1,
    )

    units_plural = (
        pynutil.insert("units: \"")
        + (plural_unit + maybe_per_unit | per_unit)
        + pynutil.insert("\"")
    )
    units_singular = (
        pynutil.insert("units: \"")
        + (singular_unit + maybe_per_unit | per_unit)
        + pynutil.insert("\"")
    )

    # --- number + unit ---------------------------------------------------
    decimal_with_unit = (
        pynutil.insert("decimal { ")
        + minus_sign
        + decimal.final_graph_wo_negative
        + delete_space
        + pynutil.insert(" } ")
        + units_plural
    )

    # Every cardinal except "1" takes the plural unit ...
    cardinal_with_unit = (
        pynutil.insert("cardinal { ")
        + minus_sign
        + pynutil.insert("integer: \"")
        + ((NEMO_SIGMA - "1") @ cardinal_graph)
        + delete_space
        + pynutil.insert("\"")
        + pynutil.insert(" } ")
        + units_plural
    )
    # ... while exactly "1" takes the singular unit.
    cardinal_with_unit |= (
        pynutil.insert("cardinal { ")
        + minus_sign
        + pynutil.insert("integer: \"")
        + pynini.cross("1", "one")
        + delete_space
        + pynutil.insert("\"")
        + pynutil.insert(" } ")
        + units_singular
    )

    fraction_with_unit = (
        pynutil.insert("fraction { ")
        + fraction.graph
        + delete_space
        + pynutil.insert(" } ")
        + units_plural
    )

    # --- dash-joined number / letter combinations --------------------------
    cardinal_then_letters = (
        pynutil.insert("cardinal { integer: \"")
        + cardinal_graph
        + pynini.accep('-')
        + pynutil.insert("\" } units: \"")
        + pynini.closure(NEMO_ALPHA, 1)
        + pynutil.insert("\"")
    )
    letters_then_cardinal = (
        pynutil.insert("units: \"")
        + pynini.closure(NEMO_ALPHA, 1)
        + pynini.accep('-')
        + pynutil.insert("\"")
        + pynutil.insert(" cardinal { integer: \"")
        + cardinal_graph
        + pynutil.insert("\" } preserve_order: true")
    )
    decimal_then_letters = (
        pynutil.insert("decimal { ")
        + decimal.final_graph_wo_negative
        + pynini.cross('-', '')
        + pynutil.insert(" } units: \"")
        + pynini.closure(NEMO_ALPHA, 1)
        + pynutil.insert("\"")
    )
    decimal_then_times = (
        pynutil.insert("decimal { ")
        + decimal.final_graph_wo_negative
        + pynutil.insert(" } units: \"")
        + pynini.cross(pynini.union('x', "X"), 'x')
        + pynutil.insert("\"")
    )
    letters_then_decimal = (
        pynutil.insert("units: \"")
        + pynini.closure(NEMO_ALPHA, 1)
        + pynini.accep('-')
        + pynutil.insert("\"")
        + pynutil.insert(" decimal { ")
        + decimal.final_graph_wo_negative
        + pynutil.insert(" } preserve_order: true")
    )

    # --- addresses -------------------------------------------------------
    address_body = self.get_address_graph(cardinal)
    address_graph = (
        pynutil.insert("units: \"address\" cardinal { integer: \"")
        + address_body
        + pynutil.insert("\" } preserve_order: true")
    )

    # --- simple equations: <number> <op> <number> = <number> -------------
    operators = pynini.string_file(get_abs_path("data/math_operations.tsv"))
    # Keep an existing space or insert one where the input has none.
    gap = pynini.accep(" ") | pynutil.insert(" ")
    equation = (
        cardinal_graph
        + gap
        + operators
        + gap
        + cardinal_graph
        + gap
        + pynini.cross("=", "equals")
        + gap
        + cardinal_graph
    )
    math_graph = (
        pynutil.insert("units: \"math\" cardinal { integer: \"")
        + equation
        + pynutil.insert("\" } preserve_order: true")
    )

    combined = (
        decimal_with_unit
        | cardinal_with_unit
        | cardinal_then_letters
        | letters_then_cardinal
        | decimal_then_letters
        | decimal_then_times
        | letters_then_decimal
        | fraction_with_unit
        | address_graph
        | math_graph
    )
    self.fst = self.add_tokens(combined).optimize()
def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = True):
    """Build the fraction classifier graph.

    Produces an optional ``integer_part: "..."`` followed by
    ``numerator: "..."`` and ``denominator: "..."``.  Denominators are
    rendered through one of three alternatives (ordinal form, cardinal form
    tagged ``add_root``, or the plain cardinal as a weighted last resort).
    The un-tokenized graph is kept in ``self.graph``; the token-wrapped,
    optimized graph is stored in ``self.fst``.

    Args:
        cardinal: provides ``graph``.
        ordinal: provides ``graph``, ``multiples_of_thousand`` and
            ``one_to_one_thousand``.
        deterministic: forwarded to the parent constructor; also narrows the
            set of accepted denominators (see the inline comments).
    """
    super().__init__(name="fraction", kind="classify", deterministic=deterministic)
    cardinal_graph = cardinal.graph
    ordinal_graph = ordinal.graph

    # 2-10 are all ordinals
    # NOTE: despite its name, `three_to_ten` starts at "2".
    three_to_ten = pynini.string_map([
        "2", "3", "4", "5", "6", "7", "8", "9", "10",
    ])
    # Captured BEFORE `three_to_ten` is widened below, so only 2-10 are
    # blocked from the cardinal branch.
    block_three_to_ten = pynutil.delete(
        three_to_ten)  # To block cardinal productions
    if not deterministic:
        # Multiples of tens are sometimes rendered as ordinals
        three_to_ten |= pynini.string_map([
            "20", "30", "40", "50", "60", "70", "80", "90",
        ])
    graph_three_to_ten = three_to_ten @ ordinal_graph
    # `ordinal_exceptions` is defined outside this block; it is applied as an
    # unconditional (empty left/right context) rewrite of the ordinal output.
    graph_three_to_ten @= pynini.cdrewrite(ordinal_exceptions, "", "", NEMO_SIGMA)

    # Higher powers of tens (and multiples) are converted to ordinals.
    hundreds = pynini.string_map([
        "100", "200", "300", "400", "500", "600", "700", "800", "900",
    ])
    graph_hundreds = hundreds @ ordinal_graph

    multiples_of_thousand = ordinal.multiples_of_thousand  # So we can have X milésimos

    graph_higher_powers_of_ten = (
        pynini.closure(ordinal.one_to_one_thousand + NEMO_SPACE, 0, 1)
        + pynini.closure("mil ", 0, 1)
        + pynini.closure(ordinal.one_to_one_thousand + NEMO_SPACE, 0, 1)
    )  # x millones / x mil millones / x mil z millones
    # `higher_powers_of_ten` is defined outside this block.
    graph_higher_powers_of_ten += higher_powers_of_ten
    graph_higher_powers_of_ten = cardinal_graph @ graph_higher_powers_of_ten
    graph_higher_powers_of_ten @= pynini.cdrewrite(
        pynutil.delete("un "), pynini.accep("[BOS]"),
        pynini.project(higher_powers_of_ten, "output"), NEMO_SIGMA
    )  # we drop 'un' from these ordinals (millionths, not one-millionths)

    graph_higher_powers_of_ten = multiples_of_thousand | graph_hundreds | graph_higher_powers_of_ten
    block_higher_powers_of_ten = pynutil.delete(
        pynini.project(graph_higher_powers_of_ten, "input"))  # For cardinal graph

    graph_fractions_ordinals = graph_higher_powers_of_ten | graph_three_to_ten
    # The inserted text closes the denominator quote and opens/closes the
    # morphosyntactic_features field in one go.
    graph_fractions_ordinals += pynutil.insert(
        "\" morphosyntactic_features: \"ordinal\""
    )  # We note the root for processing later

    # Blocking the digits and hundreds from Cardinal graph
    # The rewrite only fires when the blocked string spans the whole input
    # ([BOS] ... [EOS]), turning it into the empty string.
    graph_fractions_cardinals = pynini.cdrewrite(
        block_three_to_ten | block_higher_powers_of_ten,
        pynini.accep("[BOS]"), pynini.accep("[EOS]"), NEMO_SIGMA)
    graph_fractions_cardinals @= NEMO_CHAR.plus @ pynini.cdrewrite(
        pynutil.delete("0"), pynini.accep("[BOS]"), pynini.accep("[EOS]"), NEMO_SIGMA
    )  # Empty characters become '0' for NEMO_CHAR fst, so need to block
    graph_fractions_cardinals @= cardinal_graph
    graph_fractions_cardinals += pynutil.insert(
        "\" morphosyntactic_features: \"add_root\""
    )  # blocking these entries to reduce erroneous possibilities in debugging
    if deterministic:
        # Restrict the cardinal branch to one- or two-digit inputs.
        graph_fractions_cardinals = (
            pynini.closure(NEMO_DIGIT, 1, 2) @ graph_fractions_cardinals
        )  # Past hundreds the conventional scheme can be hard to read. For determinism we stop here

    graph_denominator = pynini.union(
        graph_fractions_ordinals,
        graph_fractions_cardinals,
        pynutil.add_weight(cardinal_graph + pynutil.insert("\""), 0.001),
    )  # Last form is simply recording the cardinal. Weighting so last resort

    # Optional whole-number part, separated from the fraction by a space.
    integer = pynutil.insert(
        "integer_part: \"") + cardinal_graph + pynutil.insert(
            "\"") + NEMO_SPACE
    # The slash (with or without surrounding spaces) closes the numerator field.
    numerator = (pynutil.insert("numerator: \"") + cardinal_graph +
                 (pynini.cross("/", "\" ") | pynini.cross(" / ", "\" ")))
    # The closing quote of the denominator is supplied by `graph_denominator`.
    denominator = pynutil.insert("denominator: \"") + graph_denominator

    self.graph = pynini.closure(integer, 0, 1) + numerator + denominator

    final_graph = self.add_tokens(self.graph)
    self.fst = final_graph.optimize()
def __init__(self):
    """Assemble the spoken e-mail / URL classifier graph.

    Spoken, space-separated characters are joined back into
    ``username: "..." domain: "..."`` (e-mail) or ``protocol: "..."`` (URL)
    fields; the token-wrapped, optimized union is stored in ``self.fst``.
    """
    super().__init__(name="electronic", kind="classify")

    # Removes exactly one space between spoken characters.
    drop_space = pynutil.delete(" ")

    char_or_digit = (
        NEMO_ALPHA
        | pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        | pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    )
    spoken_symbol = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).invert()
    username_char = char_or_digit | spoken_symbol
    spoken_dot = pynini.cross("dot", ".")

    # ---- e-mail address -------------------------------------------------
    username_field = (
        pynutil.insert("username: \"")
        + char_or_digit
        + pynini.closure(drop_space + username_char)
        + pynutil.insert("\"")
    )

    # One or more spelled-out characters, joined without spaces.
    spelled_run = pynini.closure(char_or_digit + drop_space) + char_or_digit
    server_name = spelled_run | pynini.string_file(get_abs_path("data/electronic/server_name.tsv"))
    domain_name = spelled_run | pynini.string_file(get_abs_path("data/electronic/domain.tsv"))

    domain_field = (
        pynutil.insert("domain: \"")
        + server_name
        + drop_space
        + spoken_dot
        + drop_space
        + domain_name
        + pynutil.insert("\"")
    )

    recognized = (
        username_field
        + drop_space
        + pynutil.delete("at")
        + insert_space
        + drop_space
        + domain_field
    )

    ############# url ###
    www_prefix = pynini.cross(pynini.union("w w w", "www"), "www")
    scheme_name = pynini.cross("h t t p", "http") | pynini.cross("h t t p s", "https")
    scheme_prefix = scheme_name + pynini.cross(" colon slash slash ", "://")

    # .com,
    url_tail = (
        drop_space
        + spoken_symbol
        + drop_space
        + (domain_name | pynini.closure(username_char + drop_space) + username_char)
    )

    url_body = (
        pynini.closure(scheme_prefix, 0, 1)
        + www_prefix
        + drop_space
        + spoken_dot
        + pynini.closure(drop_space + username_char, 1)
        + pynini.closure(url_tail, 1)
    )
    url_field = pynutil.insert("protocol: \"") + url_body + pynutil.insert("\"")
    recognized |= url_field
    ########

    self.fst = self.add_tokens(recognized).optimize()
def __init__(self):
    """Assemble the spoken-time classifier graph.

    Spoken times are rewritten into ``hours``, ``minutes``, ``suffix`` and
    ``zone`` fields; the token-wrapped, optimized result is stored in
    ``self.fst``.
    """
    super().__init__(name="time", kind="classify")
    # hours, minutes, seconds, suffix, zone, style, speak_period

    suffix_table = pynini.string_file(get_abs_path("data/time/time_suffix.tsv"))
    zone_table = pynini.invert(pynini.string_file(get_abs_path("data/time/time_zone.tsv")))
    hour_before_table = pynini.string_file(get_abs_path("data/time/time_to.tsv"))

    # only used for < 1000 thousand -> 0 weight
    number_graph = pynutil.add_weight(CardinalFst().graph_no_exception, weight=-0.7)

    hour_words = [num_to_word(x) for x in range(0, 24)]
    minute_words_single = [num_to_word(x) for x in range(1, 10)]
    minute_words_double = [num_to_word(x) for x in range(10, 60)]

    hour_digits = pynini.union(*hour_words) @ number_graph
    minute_single = pynini.union(*minute_words_single) @ number_graph
    minute_double = pynini.union(*minute_words_double) @ number_graph
    minute_named = pynini.cross("half", "30") | pynini.cross("quarter", "15")
    # All accepted spellings of "o'clock" are deleted.
    oclock = pynini.cross(pynini.union("o' clock", "o clock", "o'clock", "oclock"), "")

    hours_field = pynutil.insert("hours: \"") + hour_digits + pynutil.insert("\"")

    # Minutes after the hour: "o'clock" -> 00, "o <digit>" -> the single
    # digit, or a two-digit minute.
    minute_after_hour = (
        oclock + pynutil.insert("00")
        | pynutil.delete("o") + delete_space + minute_single
        | minute_double
    )

    suffix_field = pynutil.insert("suffix: \"") + convert_space(suffix_table) + pynutil.insert("\"")
    maybe_suffix = pynini.closure(delete_space + insert_space + suffix_field, 0, 1)
    maybe_zone = pynini.closure(
        delete_space
        + insert_space
        + pynutil.insert("zone: \"")
        + convert_space(zone_table)
        + pynutil.insert("\""),
        0,
        1,
    )

    # five o' clock
    # two o eight, two thirty five (am/pm)
    # two pm/am
    hour_then_minute = (
        hours_field
        + delete_extra_space
        + pynutil.insert("minutes: \"")
        + minute_after_hour
        + pynutil.insert("\"")
    )

    # 10 past four, quarter past four, half past four
    minute_past_hour = (
        pynutil.insert("minutes: \"")
        + pynini.union(minute_single, minute_double, minute_named)
        + pynutil.insert("\"")
        + delete_space
        + pynutil.delete("past")
        + delete_extra_space
        + hours_field
    )

    # quarter to/till <hour>: minutes become 45, hour comes from the table.
    quarter_to_hour = (
        pynutil.insert("minutes: \"")
        + pynini.cross("quarter", "45")
        + pynutil.insert("\"")
        + delete_space
        + pynutil.delete(pynini.union("to", "till"))
        + delete_extra_space
        + pynutil.insert("hours: \"")
        + hour_before_table
        + pynutil.insert("\"")
    )

    # Hour with a mandatory suffix; minutes default to "00" when not spoken.
    hour_with_suffix = (
        hours_field
        + delete_extra_space
        + pynutil.insert("minutes: \"")
        + (pynutil.insert("00") | minute_after_hour)
        + pynutil.insert("\"")
        + delete_space
        + insert_space
        + suffix_field
        + maybe_zone
    )

    combined = (hour_then_minute | minute_past_hour | quarter_to_hour) + maybe_suffix + maybe_zone
    combined |= hour_with_suffix

    self.fst = self.add_tokens(combined).optimize()
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize() NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() NEMO_PUNCT = pynini.union( *map(pynini.escape, string.punctuation)).optimize() NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize() NEMO_SIGMA = pynini.closure(NEMO_CHAR) delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE)) delete_zero_or_one_space = pynutil.delete( pynini.closure(NEMO_WHITE_SPACE, 0, 1)) insert_space = pynutil.insert(" ") delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ") delete_preserve_order = pynini.closure( pynutil.delete(" preserve_order: true") | (pynutil.delete(" field_order: \"") + NEMO_NOT_QUOTE + pynutil.delete("\""))) suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv")) # _v = pynini.union("a", "e", "i", "o", "u") _c = pynini.union("b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z") _ies = NEMO_SIGMA + _c + pynini.cross("y", "ies") _es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es") _s = NEMO_SIGMA + pynutil.insert("s") graph_plural = plurals._priority_union(
def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = True):
    """Assemble the money graph as a tagger composed with a verbalizer.

    The tagger emits ``integer_part`` (or a decimal block) followed by
    ``currency``; the verbalizer strips those field wrappers again.  The
    composition is stored in ``self.final_graph`` and its token-wrapped
    form in ``self.fst``.

    Args:
        cardinal: provides ``cardinal_numbers``.
        decimal: provides ``final_graph``.
        deterministic: forwarded unchanged to the parent constructor.
    """
    super().__init__(name="money", kind="classify", deterministic=deterministic)

    number_graph = cardinal.cardinal_numbers
    decimal_number = decimal.final_graph

    currency_one = pynini.string_file(get_abs_path("data/currency/currency_singular.tsv"))
    currency_many = pynini.string_file(get_abs_path("data/currency/currency_plural.tsv"))

    # ---- tagger -----------------------------------------------------------
    # adding weight to make sure the space is preserved for ITN
    maybe_drop_space = pynini.closure(
        pynutil.add_weight(pynini.cross(NEMO_SPACE, ""), -100), 0, 1)

    tag_currency_one = (
        maybe_drop_space
        + pynutil.insert(" currency: \"")
        + currency_one
        + pynutil.insert("\"")
    )
    tag_currency_many = (
        maybe_drop_space
        + pynutil.insert(" currency: \"")
        + currency_many
        + pynutil.insert("\"")
    )

    # Exactly "1" pairs with the singular currency table.
    just_one = pynini.compose(pynini.accep("1"), number_graph).optimize()
    tag_single = (
        pynutil.insert("integer_part: \"")
        + just_one
        + pynutil.insert("\"")
        + tag_currency_one
    )

    tag_decimal = decimal_number + tag_currency_many

    # Every other integer pairs with the plural currency table.
    tag_integer = (
        pynutil.insert("integer_part: \"")
        + ((NEMO_SIGMA - "1") @ number_graph)
        + pynutil.insert("\"")
        + tag_currency_many
    )
    tag_integer |= tag_single

    tagger_graph = (tag_integer.optimize() | tag_decimal.optimize()).optimize()

    # ---- verbalizer -------------------------------------------------------
    # A quoted, non-empty value with the quotes removed.
    unquoted_value = (
        pynutil.delete("\"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )
    strip_integer_part = pynutil.delete("integer_part: ") + unquoted_value

    strip_currency = (
        pynutil.delete("currency: ")
        + pynutil.delete("\"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )
    strip_currency = pynini.accep(NEMO_SPACE) + strip_currency

    verbalize_cardinal = (strip_integer_part + strip_currency).optimize()

    strip_fractional_part = pynutil.delete("fractional_part: ") + unquoted_value
    maybe_quantity = pynini.closure(
        pynini.accep(NEMO_SPACE) + pynutil.delete("quantity: ") + unquoted_value, 0, 1)

    verbalize_decimal = (
        pynutil.delete('decimal { ')
        + strip_integer_part
        + pynini.accep(" ")
        + strip_fractional_part
        + maybe_quantity
        + pynutil.delete(" }")
        + strip_currency
    )

    verbalizer_graph = (verbalize_cardinal | verbalize_decimal).optimize()

    self.final_graph = (tagger_graph @ verbalizer_graph).optimize()
    self.fst = self.add_tokens(self.final_graph).optimize()
def __init__(self):
    """Build the spoken-cardinal classifier graph.

    Number words found between word boundaries are rewritten via the
    zero/digit/ties/teen tables.  Sets:

    * ``self.graph_no_exception`` - the rewrite graph for every input;
    * ``self.graph`` - the same graph minus the words for 0..12;
    * ``self.graph_hundred_component_at_least_one_none_zero_digit`` - an
      empty-string placeholder;
    * ``self.fst`` - the token-wrapped, optimized final graph with an
      optional ``negative: "-"`` prefix.

    Earlier revisions referenced the undefined names ``tr_one_to_two_digit``
    and ``rw_number`` (raising ``NameError`` on construction) and computed
    several values twice; only the assignments that determine the final
    state are kept.
    """
    super().__init__(name="cardinal", kind="classify")

    # A number word must be delimited by a space or by a string boundary.
    left_context = pynini.union("[BOS]", NEMO_SPACE)
    right_context = pynini.union(NEMO_SPACE, "[EOS]")

    tr_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    tr_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    tr_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
    tr_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))

    tr_number = pynini.union(tr_zero, tr_digit, tr_ties, tr_teen)
    graph = pynini.cdrewrite(tr_number, left_context, right_context, NEMO_SIGMA)

    # Placeholder value; no hundreds component is built by this graph.
    self.graph_hundred_component_at_least_one_none_zero_digit = ""

    # Words for 0..12 are excluded from `self.graph` (but not from
    # `self.graph_no_exception`).
    labels_exception = [num_to_word(x) for x in range(0, 13)]
    graph_exception = pynini.union(*labels_exception)

    # Drop a space-delimited "and" and require the input to start with a
    # letter before applying the number rewrite.
    graph = (
        pynini.cdrewrite(pynutil.delete("and"), NEMO_SPACE, NEMO_SPACE, NEMO_SIGMA)
        @ (NEMO_ALPHA + NEMO_SIGMA)
        @ graph
    )

    self.graph_no_exception = graph
    self.graph = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph

    optional_minus_graph = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("minus", "\"-\"") + NEMO_SPACE, 0, 1
    )

    final_graph = (
        optional_minus_graph
        + pynutil.insert("integer: \"")
        + self.graph
        + pynutil.insert("\"")
    )
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def __init__(self, deterministic: bool = True):
    """Assemble the written-cardinal classifier graph.

    Sets ``self.graph``, ``self.single_digits_graph``, ``self.range_graph``,
    ``self.graph_hundred_component_at_least_one_none_zero_digit`` and the
    token-wrapped ``self.fst``.

    Args:
        deterministic: forwarded to the parent constructor; when False
            additional alternative readings are unioned into the final graph.
    """
    super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

    # TODO replace to have "oh" as a default for "0"
    number_names = pynini.Far(
        get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()

    # Two or three digits, or a single non-zero digit.
    self.graph_hundred_component_at_least_one_none_zero_digit = (
        pynini.closure(NEMO_DIGIT, 2, 3)
        | pynini.difference(NEMO_DIGIT, pynini.accep("0"))
    ) @ number_names

    # 1-3 leading digits followed by groups of three, each optionally
    # preceded by a comma that is removed.
    self.graph = (
        pynini.closure(NEMO_DIGIT, 1, 3)
        + pynini.closure(
            pynini.closure(pynutil.delete(","), 0, 1) + NEMO_DIGIT + NEMO_DIGIT + NEMO_DIGIT)
    ) @ number_names

    digit_table = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    zero_table = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))

    one_digit = pynini.invert(digit_table | zero_table)
    self.single_digits_graph = one_digit + pynini.closure(insert_space + one_digit)

    if not deterministic:
        # for a single token allow only the same normalization
        # "007" -> {"oh oh seven", "zero zero seven"} not {"oh zero seven"}
        one_digit_zero = pynini.invert(digit_table | zero_table)
        one_digit_oh = pynini.invert(digit_table) | pynini.cross("0", "oh")

        self.single_digits_graph = one_digit_zero + pynini.closure(
            insert_space + one_digit_zero)
        self.single_digits_graph |= one_digit_oh + pynini.closure(
            insert_space + one_digit_oh)

        single_digits_graph_with_commas = pynini.closure(
            self.single_digits_graph + insert_space, 1, 3
        ) + pynini.closure(
            pynutil.delete(",")
            + one_digit
            + insert_space
            + one_digit
            + insert_space
            + one_digit,
            1,
        )

    # "a-b" -> "from a to b", "a x b" -> "a by b".
    ranges = (
        pynutil.insert("from ")
        + self.graph
        + pynini.cross("-", " to ")
        + self.graph
    )
    ranges |= (
        self.graph
        + (pynini.cross("x", " by ") | pynini.cross(" x ", " by "))
        + self.graph
    )
    ranges |= (
        pynutil.insert("from ")
        + get_hundreds_graph()
        + pynini.cross("-", " to ")
        + get_hundreds_graph()
    )
    self.range_graph = ranges
    self.range_graph = self.range_graph.optimize()

    serial_graph = self.get_serial_graph()
    minus_sign = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)

    if deterministic:
        # Five or more digits are read digit by digit (slightly preferred).
        long_numbers = pynini.compose(
            NEMO_DIGIT**(5, ...), self.single_digits_graph).optimize()
        body = self.graph | serial_graph | pynutil.add_weight(long_numbers, -0.001)

        # Numbers starting with "0" are read digit by digit.
        cardinal_with_leading_zeros = pynini.compose(
            pynini.accep("0") + pynini.closure(NEMO_DIGIT),
            self.single_digits_graph)
        body |= cardinal_with_leading_zeros
    else:
        # Leading zeros are read one by one, the rest as a regular cardinal.
        leading_zeros = pynini.compose(
            pynini.closure(pynini.accep("0"), 1), self.single_digits_graph)
        cardinal_with_leading_zeros = (
            leading_zeros
            + pynutil.insert(" ")
            + pynini.compose(pynini.closure(NEMO_DIGIT), self.graph)
        )

        body = (
            self.graph
            | serial_graph
            | self.range_graph
            | self.single_digits_graph
            | get_hundreds_graph()
            | pynutil.add_weight(single_digits_graph_with_commas, 0.001)
            | cardinal_with_leading_zeros
        )

    tagged = minus_sign + pynutil.insert("integer: \"") + body + pynutil.insert("\"")
    tagged = self.add_tokens(tagged)
    self.fst = tagged.optimize()
def __init__(self, deterministic: bool = True, lm: bool = False):
    """Assemble the written-cardinal classifier graph.

    Sets ``self.lm``, ``self.deterministic``, ``self.graph``,
    ``self.graph_with_and``, ``self.single_digits_graph``,
    ``self.graph_hundred_component_at_least_one_none_zero_digit`` and the
    token-wrapped ``self.fst``.

    Args:
        deterministic: forwarded to the parent constructor; when False
            additional alternative readings are unioned into the final graph.
        lm: stored on the instance as ``self.lm``.
    """
    super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

    self.lm = lm
    self.deterministic = deterministic

    # TODO replace to have "oh" as a default for "0"
    number_names = pynini.Far(
        get_abs_path("data/number/cardinal_number_name.far")).get_fst()

    # Two or three digits, or a single non-zero digit.
    self.graph_hundred_component_at_least_one_none_zero_digit = (
        pynini.closure(NEMO_DIGIT, 2, 3)
        | pynini.difference(NEMO_DIGIT, pynini.accep("0"))
    ) @ number_names

    digit_table = pynini.string_file(get_abs_path("data/number/digit.tsv"))
    zero_table = pynini.string_file(get_abs_path("data/number/zero.tsv"))

    one_digit = pynini.invert(digit_table | zero_table)
    self.single_digits_graph = one_digit + pynini.closure(insert_space + one_digit)

    if not deterministic:
        # for a single token allow only the same normalization
        # "007" -> {"oh oh seven", "zero zero seven"} not {"oh zero seven"}
        one_digit_zero = pynini.invert(digit_table | zero_table)
        one_digit_oh = pynini.invert(digit_table) | pynini.cross("0", "oh")

        self.single_digits_graph = one_digit_zero + pynini.closure(
            insert_space + one_digit_zero)
        self.single_digits_graph |= one_digit_oh + pynini.closure(
            insert_space + one_digit_oh)

        single_digits_graph_with_commas = pynini.closure(
            self.single_digits_graph + insert_space, 1, 3
        ) + pynini.closure(
            pynutil.delete(",")
            + one_digit
            + insert_space
            + one_digit
            + insert_space
            + one_digit,
            1,
        )

    minus_sign = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)

    # 1-3 leading digits followed by groups of three digits, either all
    # comma-separated (commas removed) or all unseparated.
    number_names = (
        pynini.closure(NEMO_DIGIT, 1, 3)
        + (
            pynini.closure(pynutil.delete(",") + NEMO_DIGIT**3)
            | pynini.closure(NEMO_DIGIT**3)
        )
    ) @ number_names

    self.graph = number_names
    self.graph_with_and = self.add_optional_and(number_names)

    if deterministic:
        # Five or more digits are read digit by digit; that reading takes
        # priority over the regular cardinal reading.
        long_numbers = pynini.compose(
            NEMO_DIGIT**(5, ...), self.single_digits_graph).optimize()
        body = plurals._priority_union(
            long_numbers, self.graph_with_and, NEMO_SIGMA).optimize()

        # Numbers starting with "0" are read digit by digit.
        cardinal_with_leading_zeros = pynini.compose(
            pynini.accep("0") + pynini.closure(NEMO_DIGIT),
            self.single_digits_graph)
        body |= cardinal_with_leading_zeros
    else:
        # Leading zeros are read one by one, the rest as a regular cardinal.
        leading_zeros = pynini.compose(
            pynini.closure(pynini.accep("0"), 1), self.single_digits_graph)
        cardinal_with_leading_zeros = (
            leading_zeros
            + pynutil.insert(" ")
            + pynini.compose(pynini.closure(NEMO_DIGIT), self.graph_with_and)
        )

        # add small weight to non-default graphs to make sure the deterministic option is listed first
        body = (
            self.graph_with_and
            | pynutil.add_weight(self.single_digits_graph, 0.0001)
            | get_four_digit_year_graph()  # allows e.g. 4567 be pronounced as forty five sixty seven
            | pynutil.add_weight(single_digits_graph_with_commas, 0.0001)
            | cardinal_with_leading_zeros
        )

    tagged = minus_sign + pynutil.insert("integer: \"") + body + pynutil.insert("\"")
    tagged = self.add_tokens(tagged)
    self.fst = tagged.optimize()
def _get_digits_graph():
    """Return the optimized graph ``("oh" | "o") -> "0"``, then ``delete_space``, then ``graph_digit``.

    ``delete_space`` and ``graph_digit`` are taken from the enclosing scope.
    """
    spoken_zero = pynini.accep("oh") | pynini.accep("o")
    zero_then_digit = pynini.cross(spoken_zero, "0") + delete_space + graph_digit
    # Fst.optimize() works in place and hands back the same object.
    return zero_then_digit.optimize()
def __init__(self, syms):
    """Build the two mapping transducers ``self.__map1`` and ``self.__map2``.

    Everything is compiled with ``syms.alphabet`` as the default token type,
    so multi-character strings such as ``"<Base_Stems>"`` are looked up as
    single symbols of that alphabet.

    Args:
        syms: symbol inventory; this method reads its ``alphabet``,
            ``characters``, ``categories``, ``disjunctive_categories``,
            ``initial_features``, ``stem_types``, ``prefix_suffix_marker``,
            ``stem_type_features``, ``origin_features``,
            ``complexity_agreement_features``, ``complexity_entry_features``,
            ``inflection_classes`` and ``circumfix_features`` attributes.
    """
    with pynini.default_token_type(syms.alphabet):

        def eps_to(target):
            # Transducer from the empty string to `target`.
            return pynini.cross("", target)

        def any_of(symbols):
            # Acceptor matching any one of the listed symbols.
            return pynini.string_map(symbols).project("input")

        def right_nested_concat(parts):
            # concat(p0, concat(p1, concat(p2, ...))) -- same association as
            # writing the nested pynini.concat calls out by hand.
            nested = parts[-1]
            for part in reversed(parts[:-1]):
                nested = pynini.concat(part, nested)
            return nested

        # store alphabet
        self.__syms = syms

        #
        # epsilon <-> feature-class transducers
        #
        eps_to_initial = eps_to(syms.initial_features)
        eps_to_category = eps_to(pynini.union(syms.categories, syms.disjunctive_categories))
        eps_to_stem_type = eps_to(syms.stem_types)
        # Built as in the original, but not referenced again in this method.
        eps_to_affix_marker = eps_to(syms.prefix_suffix_marker)
        affix_marker_to_eps = pynini.cross(syms.prefix_suffix_marker, "")
        eps_to_stem_type_feat = eps_to(syms.stem_type_features)
        eps_to_origin = eps_to(syms.origin_features)
        eps_to_complexity_agreement = eps_to(syms.complexity_agreement_features)
        eps_to_complexity_entry = eps_to(syms.complexity_entry_features)
        complexity_entry_to_eps = pynini.cross(syms.complexity_entry_features, "")
        eps_to_infl_class = eps_to(syms.inflection_classes)

        # disjunctive features
        disjunctive_feat_list = [
            "<CARD,DIGCARD,NE>", "<ADJ,CARD>", "<ADJ,NN>", "<CARD,NN>",
            "<CARD,NE>", "<ABK,ADJ,NE,NN>", "<ADJ,NE,NN>", "<ABK,NE,NN>",
            "<NE,NN>", "<ABK,CARD,NN>", "<ABK,NN>", "<ADJ,CARD,NN,V>",
            "<ADJ,NN,V>", "<ABK,ADJ,NE,NN,V>", "<ADJ,NE,NN,V>",
            "<ADV,NE,NN,V>", "<ABK,NE,NN,V>", "<NE,NN,V>", "<ABK,NN,V>",
            "<NN,V>", "<frei,fremd,gebunden>", "<frei,fremd,gebunden,kurz>",
            "<frei,fremd,gebunden,lang>", "<fremd,gebunden,lang>",
            "<frei,fremd,kurz>", "<frei,fremd,lang>", "<frei,gebunden>",
            "<frei,gebunden,kurz,lang>", "<frei,gebunden,lang>",
            "<frei,lang>", "<klassisch,nativ>", "<fremd,klassisch,nativ>",
            "<fremd,klassisch>", "<frei,nativ>", "<frei,fremd,nativ>",
            "<fremd,nativ>", "<nativ,prefnativ>", "<frei,nativ,prefnativ>",
            "<komposit,prefderiv,simplex,suffderiv>",
            "<prefderiv,suffderiv>", "<komposit,prefderiv,simplex>",
            "<komposit,simplex,suffderiv>", "<komposit,simplex>",
            "<prefderiv,simplex,suffderiv>", "<prefderiv,simplex>",
            "<simplex,suffderiv>",
        ]
        disjunctive_feats = any_of(disjunctive_feat_list).optimize()
        eps_to_disjunctive = eps_to(disjunctive_feats)

        # short cut: map_helper1
        map_helper1 = pynini.union(
            syms.characters,
            pynini.accep("<FB>"),
            pynini.accep("<SS>"),
            pynini.cross("e", "<Ge-Nom>"),
            pynini.cross("n", "<n>"),
            pynini.cross("e", "<e>"),
            pynini.cross("d", "<d>"),
            eps_to("<~n>"),
            eps_to("<UL>"),
            eps_to_stem_type,
            syms.prefix_suffix_marker,
            eps_to_stem_type_feat,
            eps_to("<ge>"),
            eps_to_origin,
            eps_to_complexity_agreement,
            eps_to_complexity_entry,
            eps_to_infl_class,
            eps_to_disjunctive,
        ).closure().optimize()

        # short cut: map_helper2
        chars_or_markers = pynini.union(
            pynini.union(syms.characters, pynini.accep("<SUFF>"), pynini.accep("<CONV>")),
            syms.categories,
        )
        optional_middle = pynini.concat(syms.characters, chars_or_markers.closure()).closure(0, 1)
        map_helper2 = pynini.concat(map_helper1, pynini.concat(optional_middle, map_helper1)).optimize()

        #
        # map1: the six stem-type alternatives, in their original order
        #
        base_or_pref_stem = pynini.concat(
            eps_to(any_of(["<Base_Stems>", "<Pref_Stems>"])),
            pynini.concat(map_helper2, eps_to_category),
        )
        deriv_or_kompos_stem = pynini.concat(
            eps_to(any_of(["<Deriv_Stems>", "<Kompos_Stems>"])),
            pynini.concat(map_helper2, syms.categories),
        )
        pref_stem = eps_to("<Pref_Stems>") + map_helper1 + eps_to_category
        suff_stem_plain = (
            eps_to("<Suff_Stems>") + map_helper1 + eps_to_category
            + map_helper1 + syms.categories + eps_to("<base>")
        )
        suff_stem_base = (
            eps_to("<Suff_Stems>") + map_helper1 + eps_to_category
            + pynini.concat(map_helper1, eps_to_category + pynini.accep("<SUFF>")).closure(1)
            + eps_to("<base>")
        )
        suff_stem_deriv_kompos = (
            eps_to("<Suff_Stems>") + map_helper1 + eps_to_category
            + pynini.concat(map_helper1, syms.categories + pynini.accep("<SUFF>")).closure(1)
            + eps_to(any_of(["<deriv>", "<kompos>"]))
        )
        stem_alternatives = pynini.union(
            base_or_pref_stem,
            deriv_or_kompos_stem,
            pref_stem,
            suff_stem_plain,
            suff_stem_base,
            suff_stem_deriv_kompos,
        )
        self.__map1 = pynini.concat(
            eps_to_initial.closure(),
            pynini.concat(stem_alternatives, map_helper1),
        ).optimize()

        #
        # combined <NGeo-...> origin feature -> any one of its three parts
        #
        ngeo_splits = [
            ("<NGeo-0-$er-$er>", ["<NGeo-0-Name-Neut_s>", "<NGeo-$er-NMasc_s_0>", "<NGeo-$er-Adj0-Up>"]),
            ("<NGeo-0-$er-$isch>", ["<NGeo-0-Name-Neut_s>", "<NGeo-$er-NMasc_s_0>", "<NGeo-$isch-Adj+>"]),
            ("<NGeo-0-aner-aner>", ["<NGeo-0-Name-Neut_s>", "<NGeo-aner-NMasc_s_0>", "<NGeo-aner-Adj0-Up>"]),
            ("<NGeo-0-aner-anisch>", ["<NGeo-0-Name-Neut_s>", "<NGeo-aner-NMasc_s_0>", "<NGeo-anisch-Adj+>"]),
            ("<NGeo-0-e-isch>", ["<NGeo-0-Name-Neut_s>", "<NGeo-e-NMasc_n_n>", "<NGeo-isch-Adj+>"]),
            ("<NGeo-0-er-er>", ["<NGeo-0-Name-Neut_s>", "<NGeo-er-NMasc_s_0>", "<NGeo-er-Adj0-Up>"]),
            ("<NGeo-0-0-0>", ["<NGeo-0-Name-Neut_s>", "<NGeo-0-NMasc_s_0>", "<NGeo-0-Adj0-Up>"]),
            ("<NGeo-0-er-erisch>", ["<NGeo-0-Name-Neut_s>", "<NGeo-er-NMasc_s_0>", "<NGeo-erisch-Adj+>"]),
            ("<NGeo-0-er-isch>", ["<NGeo-0-Name-Neut_s>", "<NGeo-er-NMasc_s_0>", "<NGeo-isch-Adj+>"]),
            ("<NGeo-0-ese-esisch>", ["<NGeo-0-Name-Neut_s>", "<NGeo-ese-NMasc_n_n>", "<NGeo-esisch-Adj+>"]),
            ("<NGeo-0-ianer-ianisch>", ["<NGeo-0-Name-Neut_s>", "<NGeo-ianer-NMasc_s_0>", "<NGeo-ianisch-Adj+>"]),
            ("<NGeo-0-ner-isch>", ["<NGeo-0-Name-Neut_s>", "<NGeo-ner-NMasc_s_0>", "<NGeo-isch-Adj+>"]),
            ("<NGeo-0-ner-nisch>", ["<NGeo-0-Name-Neut_s>", "<NGeo-ner-NMasc_s_0>", "<NGeo-nisch-Adj+>"]),
            ("<NGeo-0fem-er-erisch>", ["<NGeo-0-Name-Fem_0>", "<NGeo-er-NMasc_s_0>", "<NGeo-erisch-Adj+>"]),
            ("<NGeo-0masc-er-isch>", ["<NGeo-0-Name-Masc_s>", "<NGeo-er-NMasc_s_0>", "<NGeo-isch-Adj+>"]),
            ("<NGeo-0masc-ese-esisch>", ["<NGeo-0-Name-Masc_s>", "<NGeo-ese-NMasc_n_n>", "<NGeo-esisch-Adj+>"]),
            ("<NGeo-a-er-isch>", ["<NGeo-a-Name-Neut_s>", "<NGeo-er-NMasc_s_0>", "<NGeo-isch-Adj+>"]),
            ("<NGeo-a-ese-esisch>", ["<NGeo-a-Name-Neut_s>", "<NGeo-ese-NMasc_n_n>", "<NGeo-esisch-Adj+>"]),
            ("<NGeo-afem-er-isch>", ["<NGeo-a-Name-Fem_s>", "<NGeo-er-NMasc_s_0>", "<NGeo-isch-Adj+>"]),
            ("<NGeo-e-er-er>", ["<NGeo-e-Name-Neut_s>", "<NGeo-er-NMasc_s_0>", "<NGeo-er-Adj0-Up>"]),
            ("<NGeo-e-er-isch>", ["<NGeo-e-Name-Neut_s>", "<NGeo-er-NMasc_s_0>", "<NGeo-isch-Adj+>"]),
            ("<NGeo-efem-er-isch>", ["<NGeo-e-Name-Fem_0>", "<NGeo-er-NMasc_s_0>", "<NGeo-isch-Adj+>"]),
            ("<NGeo-ei-e-isch>", ["<NGeo-ei-Name-Fem_0>", "<NGeo-e-NMasc_n_n>", "<NGeo-isch-Adj+>"]),
            ("<NGeo-en-aner-anisch>", ["<NGeo-en-Name-Neut_s>", "<NGeo-aner-NMasc_s_0>", "<NGeo-anisch-Adj+>"]),
            ("<NGeo-en-e-$isch>", ["<NGeo-en-Name-Neut_s>", "<NGeo-e-NMasc_n_n>", "<NGeo-$isch-Adj+>"]),
            ("<NGeo-en-e-isch>", ["<NGeo-en-Name-Neut_s>", "<NGeo-e-NMasc_n_n>", "<NGeo-isch-Adj+>"]),
            ("<NGeo-en-er-er>", ["<NGeo-en-Name-Neut_s>", "<NGeo-er-NMasc_s_0>", "<NGeo-er-Adj0-Up>"]),
            ("<NGeo-en-er-isch>", ["<NGeo-en-Name-Neut_s>", "<NGeo-er-NMasc_s_0>", "<NGeo-isch-Adj+>"]),
            ("<NGeo-ien-e-isch>", ["<NGeo-ien-Name-Neut_s>", "<NGeo-e-NMasc_n_n>", "<NGeo-isch-Adj+>"]),
            ("<NGeo-ien-er-isch>", ["<NGeo-ien-Name-Neut_s>", "<NGeo-er-NMasc_s_0>", "<NGeo-isch-Adj+>"]),
            ("<NGeo-ien-ese-esisch>", ["<NGeo-ien-Name-Neut_s>", "<NGeo-ese-NMasc_n_n>", "<NGeo-esisch-Adj+>"]),
            ("<NGeo-ien-ianer-ianisch>", ["<NGeo-ien-Name-Neut_s>", "<NGeo-ianer-NMasc_s_0>", "<NGeo-ianisch-Adj+>"]),
            ("<NGeo-ien-ier-isch>", ["<NGeo-ien-Name-Neut_s>", "<NGeo-ier-NMasc_s_0>", "<NGeo-isch-Adj+>"]),
            ("<NGeo-istan-e-isch>", ["<NGeo-istan-Name-Neut_s>", "<NGeo-e-NMasc_n_n>", "<NGeo-isch-Adj+>"]),
            ("<NGeo-land-$er-$er>", ["<NGeo-land-Name-Neut_s>", "<NGeo-$er-NMasc_s_0>", "<NGeo-$er-Adj0-Up>"]),
            ("<NGeo-land-e-isch>", ["<NGeo-land-Name-Neut_s>", "<NGeo-e-NMasc_n_n>", "<NGeo-isch-Adj+>"]),
            ("<NGeo-land-e-nisch>", ["<NGeo-land-Name-Neut_s>", "<NGeo-e-NMasc_n_n>", "<NGeo-nisch-Adj+>"]),
        ]
        split_origin_features = pynini.union(
            *[pynini.cross(combined, any_of(parts)) for combined, parts in ngeo_splits]
        ).optimize()

        map_helper3 = pynini.union(
            syms.characters,
            syms.circumfix_features,
            syms.initial_features,
            syms.stem_types,
            syms.categories,
            affix_marker_to_eps,
            syms.stem_type_features,
            syms.origin_features,
            syms.complexity_agreement_features,
            complexity_entry_to_eps,
            syms.inflection_classes,
            self.__split_disjunctive_feats(disjunctive_feat_list),
            split_origin_features,
        ).optimize()

        #
        # map2: any map_helper3 sequence, optionally followed by the
        # "e" -> "<e>" pattern ending in <VVReg-el/er>
        #
        el_er_pattern = right_nested_concat([
            pynini.cross("e", "<e>"),
            any_of(["l", "r"]),
            any_of(["<ADJ>", "<NE>", "<NN>", "<V>"]).closure(0, 1),
            pynini.accep("<V>"),
            any_of(["<SUFF>", "<CONV>"]).closure(0, 1),
            pynini.accep("<base> <nativ>"),
            complexity_entry_to_eps.closure(0, 1),
            pynini.accep("<VVReg-el/er>"),
        ])
        self.__map2 = pynini.concat(
            map_helper3.closure(),
            el_er_pattern.closure(0, 1),
        ).optimize()
def __init__(self, deterministic: bool = True):
    """Build the telephone classifier and store it in ``self.fst``.

    The resulting graph is the union of three families: phone numbers
    (optional country code, area part, seven-symbol number, optional
    extension), dotted four-part digit groups ("ip") and ddd-dd-dddd
    groups ("ssn"), the last two with an optional prompt.

    Args:
        deterministic: when False, "0" may also be read as "o" or "oh".
    """
    super().__init__(name="telephone", kind="classify", deterministic=deterministic)

    add_separator = pynutil.insert(", ")  # between components

    zero = pynini.cross("0", "zero")
    if not deterministic:
        zero |= pynini.cross("0", pynini.union("o", "oh"))
    digit = pynini.invert(pynini.string_file(get_abs_path("data/number/digit.tsv"))).optimize() | zero

    #
    # country code
    #
    telephone_prompts = pynini.string_file(get_abs_path("data/telephone/telephone_prompt.tsv"))
    country_code = (
        pynini.closure(telephone_prompts + delete_extra_space, 0, 1)
        + pynini.closure(pynini.cross("+", "plus "), 0, 1)
        + pynini.closure(digit + insert_space, 0, 2)
        + digit
        + pynutil.insert(",")
    )
    country_code |= telephone_prompts
    country_code = pynutil.insert("country_code: \"") + country_code + pynutil.insert("\"")
    country_code = country_code + pynini.closure(pynutil.delete("-"), 0, 1) + delete_space + insert_space

    #
    # area part: three digits, "800" read as "eight hundred"
    #
    area_part_default = pynini.closure(digit + insert_space, 2, 2) + digit
    area_part = pynini.cross("800", "eight hundred") | pynini.compose(
        pynini.difference(NEMO_SIGMA, "800"), area_part_default
    )
    area_then_separator = area_part + (pynutil.delete("-") | pynutil.delete("."))
    area_in_parentheses = (
        pynutil.delete("(")
        + area_part
        + ((pynutil.delete(")") + pynini.closure(pynutil.delete(" "), 0, 1)) | pynutil.delete(")-"))
    )
    area_part = (area_then_separator | area_in_parentheses) + add_separator

    #
    # number: exactly seven digits/letters, each optionally followed by a separator
    #
    del_separator = pynini.closure(pynini.union("-", " ", "."), 0, 1)
    number_length = ((NEMO_DIGIT + del_separator) | (NEMO_ALPHA + del_separator)) ** 7

    def spoken_run(separator):
        # Digits are read out; `separator` becomes ", " after a digit and
        # " " after a letter.
        return pynini.closure(
            (NEMO_DIGIT @ digit) + (insert_space | (pynini.cross(separator, ', ')))
            | NEMO_ALPHA
            | (NEMO_ALPHA + pynini.cross(separator, ' '))
        )

    number_words = spoken_run("-") | spoken_run(".")
    number_words = pynini.compose(number_length, number_words)

    number_part = area_part + number_words
    number_part = pynutil.insert("number_part: \"") + number_part + pynutil.insert("\"")

    #
    # extension: one to four digits
    #
    extension = (
        pynutil.insert("extension: \"") + pynini.closure(digit + insert_space, 0, 3) + digit + pynutil.insert("\"")
    )
    extension = pynini.closure(insert_space + extension, 0, 1)

    graph = plurals._priority_union(country_code + number_part, number_part, NEMO_SIGMA).optimize()
    graph = plurals._priority_union(country_code + number_part + extension, graph, NEMO_SIGMA).optimize()
    graph = plurals._priority_union(number_part + extension, graph, NEMO_SIGMA).optimize()

    def with_optional_prompt(prompts, body):
        # [country_code: "<prompt>" ]number_part: "<body>"
        optional_prompt = pynini.closure(
            pynutil.insert("country_code: \"") + prompts + pynutil.insert("\"") + delete_extra_space, 0, 1
        )
        return optional_prompt + pynutil.insert("number_part: \"") + body.optimize() + pynutil.insert("\"")

    # ip
    ip_prompts = pynini.string_file(get_abs_path("data/telephone/ip_prompt.tsv"))
    digit_to_str_graph = digit + pynini.closure(pynutil.insert(" ") + digit, 0, 2)
    ip_graph = digit_to_str_graph + (pynini.cross(".", " dot ") + digit_to_str_graph) ** 3
    graph |= with_optional_prompt(ip_prompts, ip_graph)

    # ssn
    ssn_prompts = pynini.string_file(get_abs_path("data/telephone/ssn_prompt.tsv"))
    three_digit_part = digit + (pynutil.insert(" ") + digit) ** 2
    two_digit_part = digit + pynutil.insert(" ") + digit
    four_digit_part = digit + (pynutil.insert(" ") + digit) ** 3
    ssn_separator = pynini.cross("-", ", ")
    ssn_graph = three_digit_part + ssn_separator + two_digit_part + ssn_separator + four_digit_part
    graph |= with_optional_prompt(ssn_prompts, ssn_graph)

    self.fst = self.add_tokens(graph).optimize()
def __init__(self, cardinal: GraphFst, deterministic: bool):
    """Build the decimal classifier and store it in ``self.fst``.

    Published attributes: ``graph``, ``graph_fractional``, ``graph_integer``,
    ``final_graph_wo_negative_w_abbr``, ``final_graph_wo_negative`` and ``fst``.

    Args:
        cardinal: cardinal tagger; its ``graph_with_and``,
            ``graph_hundred_component_at_least_one_none_zero_digit`` and
            ``single_digits_graph`` attributes are used.
        deterministic: when False the fractional part may also be read as a
            cardinal and an integer part "zero" may also be read as "oh".
    """
    super().__init__(name="decimal", kind="classify", deterministic=deterministic)

    integer_reading = cardinal.graph_with_and
    hundred_component = cardinal.graph_hundred_component_at_least_one_none_zero_digit

    # Fractional digits are read one by one by default.
    self.graph = cardinal.single_digits_graph.optimize()
    if not deterministic:
        self.graph = self.graph | integer_reading

    point = pynutil.delete(".")
    optional_graph_negative = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1
    )

    self.graph_fractional = pynutil.insert("fractional_part: \"") + self.graph + pynutil.insert("\"")
    self.graph_integer = pynutil.insert("integer_part: \"") + integer_reading + pynutil.insert("\"")

    # [integer_part] "." fractional_part -- the integer part is optional.
    optional_integer = pynini.closure(self.graph_integer + pynutil.insert(" "), 0, 1)
    final_graph_wo_sign = optional_integer + point + pynutil.insert(" ") + self.graph_fractional

    quantity_w_abbr = get_quantity(final_graph_wo_sign, hundred_component, include_abbr=True)
    quantity_wo_abbr = get_quantity(final_graph_wo_sign, hundred_component, include_abbr=False)

    self.final_graph_wo_negative_w_abbr = final_graph_wo_sign | quantity_w_abbr
    self.final_graph_wo_negative = final_graph_wo_sign | quantity_wo_abbr

    # reduce options for non_deterministic and allow either "oh" or "zero",
    # but not combination
    if not deterministic:
        oh_before_zero = NEMO_SIGMA + "oh" + NEMO_SIGMA + "zero" + NEMO_SIGMA
        zero_before_oh = NEMO_SIGMA + "zero" + NEMO_SIGMA + "oh" + NEMO_SIGMA
        no_oh_zero = pynini.difference(NEMO_SIGMA, oh_before_zero | zero_before_oh).optimize()
        # NOTE(review): every string this filter rejects ("zero" ... "oh")
        # is already rejected by no_oh_zero above; kept to mirror the
        # original pipeline exactly.
        no_zero_oh = pynini.difference(
            NEMO_SIGMA,
            NEMO_SIGMA + pynini.accep("zero") + NEMO_SIGMA + pynini.accep("oh") + NEMO_SIGMA,
        ).optimize()

        zero_to_oh = pynini.cdrewrite(
            pynini.cross("integer_part: \"zero\"", "integer_part: \"oh\""),
            NEMO_SIGMA,
            NEMO_SIGMA,
            NEMO_SIGMA,
        )
        readings = self.final_graph_wo_negative
        readings = readings | pynini.compose(readings, zero_to_oh)
        for allowed in (no_oh_zero, no_zero_oh):
            readings = pynini.compose(readings, allowed).optimize()
        self.final_graph_wo_negative = readings

    signed_graph = optional_graph_negative + self.final_graph_wo_negative
    self.fst = self.add_tokens(signed_graph).optimize()
def __init__(self):
    """Build the spoken-to-written cardinal classifier and store it in ``self.fst``.

    Published attributes:
        graph_hundred_component_at_least_one_none_zero_digit: three-digit
            component restricted to outputs containing a non-"0" digit.
        graph_no_exception: the full number graph.
        graph: ``graph_no_exception`` minus the inputs ``num_to_word(0)`` ..
            ``num_to_word(12)``.
    """
    super().__init__(name="cardinal", kind="classify")

    # integer, negative
    graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
    graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))

    # Hundreds position: "<digit> hundred", or an inserted "0".
    graph_hundred = pynini.cross("hundred", "")
    hundreds_position = pynini.union(graph_digit + delete_space + graph_hundred, pynutil.insert("0"))
    # Tens and ones positions: a teen (or inserted "00"), or ties + digit
    # with "0" inserted for whichever is absent.
    tens_and_ones = pynini.union(
        graph_teen | pynutil.insert("00"),
        (graph_ties | pynutil.insert("0")) + delete_space + (graph_digit | pynutil.insert("0")),
    )
    graph_hundred_component = hundreds_position + delete_space + tens_and_ones

    at_least_one_non_zero = pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT)
    non_zero_hundred_component = graph_hundred_component @ at_least_one_non_zero
    self.graph_hundred_component_at_least_one_none_zero_digit = non_zero_hundred_component

    def scaled_by(scale_word):
        # "<1..999> <scale_word>", or "000" inserted at a small cost when
        # that magnitude is absent.
        return pynini.union(
            non_zero_hundred_component + delete_space + pynutil.delete(scale_word),
            pynutil.insert("000", weight=0.1),
        )

    # Largest magnitude first, each followed by optional space removal.
    scale_words = ("sextillion", "quintillion", "quadrillion", "trillion", "billion", "million", "thousand")
    whole_number = None
    for scale_word in scale_words:
        component = scaled_by(scale_word)
        whole_number = component if whole_number is None else whole_number + delete_space + component
    whole_number = whole_number + delete_space + graph_hundred_component

    graph = pynini.union(whole_number, graph_zero)

    # Strip the leading zeros introduced by the padding, keeping a lone "0".
    strip_leading_zeros = pynini.union(
        pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT),
        "0",
    )
    graph = graph @ strip_leading_zeros

    labels_exception = [num_to_word(value) for value in range(0, 13)]
    graph_exception = pynini.union(*labels_exception)

    # Drop the word "and" when it stands between spaces.
    drop_and = pynini.cdrewrite(pynutil.delete("and"), NEMO_SPACE, NEMO_SPACE, NEMO_SIGMA)
    graph = drop_and @ graph

    self.graph_no_exception = graph
    accepted_inputs = pynini.project(graph, "input") - graph_exception.arcsort()
    self.graph = accepted_inputs @ graph

    optional_minus_graph = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("minus", "\"-\"") + NEMO_SPACE, 0, 1
    )

    tagged = optional_minus_graph + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")
    self.fst = self.add_tokens(tagged).optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = True):
    """Build the money classifier and store it in ``self.fst``.

    Args:
        cardinal: cardinal tagger; ``graph`` and
            ``graph_hundred_component_at_least_one_none_zero_digit`` are used.
        decimal: decimal tagger; ``final_graph_wo_negative`` is used.
        deterministic: when False, additional reordered alternatives are
            unioned into the final graph.

    NOTE(review): ``maj_singular``, ``min_singular``, ``min_plural`` and
    ``SINGULAR_TO_PLURAL`` are not defined in this method; they are expected
    to exist in an enclosing scope -- confirm.
    NOTE(review): the nesting of the statements below was reconstructed from
    a whitespace-collapsed source; confirm the extents of the
    ``if not deterministic:`` blocks inside the loop.
    """
    super().__init__(name="money", kind="classify", deterministic=deterministic)
    cardinal_graph = cardinal.graph
    graph_decimal_final = decimal.final_graph_wo_negative

    # Rows of the currency table; only the first column (the symbol) is
    # used, in the loop further down.
    maj_singular_labels = load_labels(
        get_abs_path("data/currency/currency.tsv"))
    maj_unit_plural = convert_space(maj_singular @ SINGULAR_TO_PLURAL)
    maj_unit_singular = convert_space(maj_singular)

    graph_maj_singular = pynutil.insert(
        "currency_maj: \"") + maj_unit_singular + pynutil.insert("\"")
    graph_maj_plural = pynutil.insert(
        "currency_maj: \"") + maj_unit_plural + pynutil.insert("\"")

    # Optional "." followed by one or more "0", all deleted.
    optional_delete_fractional_zeros = pynini.closure(
        pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), 0, 1)

    graph_integer_one = pynutil.insert("integer_part: \"") + pynini.cross(
        "1", "one") + pynutil.insert("\"")
    # only for decimals where third decimal after comma is non-zero or with quantity
    decimal_delete_last_zeros = (
        pynini.closure(NEMO_DIGIT | pynutil.delete(",")) +
        pynini.accep(".") + pynini.closure(NEMO_DIGIT, 2) +
        (NEMO_DIGIT - "0") + pynini.closure(pynutil.delete("0")))
    # Any input that ends in a letter.
    decimal_with_quantity = NEMO_SIGMA + NEMO_ALPHA

    graph_decimal = (graph_maj_plural + insert_space +
                     (decimal_delete_last_zeros | decimal_with_quantity)
                     @ graph_decimal_final)

    # Every integer except exactly "1", which is covered by graph_integer_one.
    graph_integer = (pynutil.insert("integer_part: \"") +
                     ((NEMO_SIGMA - "1") @ cardinal_graph) +
                     pynutil.insert("\""))

    graph_integer_only = graph_maj_singular + insert_space + graph_integer_one
    graph_integer_only |= graph_maj_plural + insert_space + graph_integer

    final_graph = (graph_integer_only +
                   optional_delete_fractional_zeros) | graph_decimal

    # remove trailing zeros of non zero number in the first 2 digits and fill up to 2 digits
    # e.g. 2000 -> 20, 0200->02, 01 -> 01, 10 -> 10
    # not accepted: 002, 00, 0,
    two_digits_fractional_part = (
        pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") +
        pynini.closure(pynutil.delete("0"))) @ (
            (pynutil.delete("0") + (NEMO_DIGIT - "0"))
            | ((NEMO_DIGIT - "0") + pynutil.insert("0"))
            | ((NEMO_DIGIT - "0") + NEMO_DIGIT))

    graph_min_singular = pynutil.insert(
        " currency_min: \"") + min_singular + pynutil.insert("\"")
    graph_min_plural = pynutil.insert(
        " currency_min: \"") + min_plural + pynutil.insert("\"")

    # format ** dollars ** cent
    # Accumulators: each loop iteration unions in the graph for one symbol.
    decimal_graph_with_minor = None
    integer_graph_reordered = None
    decimal_default_reordered = None
    for curr_symbol, _ in maj_singular_labels:
        preserve_order = pynutil.insert(" preserve_order: true")
        # The symbol is re-inserted after the number and mapped through the
        # major-unit graph, so the unit follows the amount in the output.
        integer_plus_maj = graph_integer + insert_space + pynutil.insert(
            curr_symbol) @ graph_maj_plural
        integer_plus_maj |= graph_integer_one + insert_space + pynutil.insert(
            curr_symbol) @ graph_maj_singular

        # Variant that tolerates "," in the integer (commas are deleted);
        # the first digit must not be "0".
        integer_plus_maj_with_comma = pynini.compose(
            NEMO_DIGIT - "0" +
            pynini.closure(NEMO_DIGIT | pynutil.delete(",")),
            integer_plus_maj)
        # Digits only, excluding the single string "0".
        integer_plus_maj = pynini.compose(
            pynini.closure(NEMO_DIGIT) - "0", integer_plus_maj)
        integer_plus_maj |= integer_plus_maj_with_comma

        # Fractional part equal to "1" -> "one" (paired with the singular
        # minor unit below).
        graph_fractional_one = two_digits_fractional_part @ pynini.cross(
            "1", "one")
        graph_fractional_one = pynutil.insert(
            "fractional_part: \"") + graph_fractional_one + pynutil.insert(
                "\"")
        # Any other one- or two-digit fractional part.
        graph_fractional = (two_digits_fractional_part @ (
            pynini.closure(NEMO_DIGIT, 1, 2) - "1"
        ) @ cardinal.graph_hundred_component_at_least_one_none_zero_digit)
        graph_fractional = pynutil.insert(
            "fractional_part: \"") + graph_fractional + pynutil.insert(
                "\"")

        fractional_plus_min = graph_fractional + insert_space + pynutil.insert(
            curr_symbol) @ graph_min_plural
        fractional_plus_min |= (
            graph_fractional_one + insert_space +
            pynutil.insert(curr_symbol) @ graph_min_singular)

        decimal_graph_with_minor_curr = integer_plus_maj + pynini.cross(
            ".", " ") + fractional_plus_min

        if not deterministic:
            # Alternative without a minor unit, at a small extra weight.
            decimal_graph_with_minor_curr |= pynutil.add_weight(
                integer_plus_maj + pynini.cross(".", " ") +
                pynutil.insert("fractional_part: \"") +
                two_digits_fractional_part @ cardinal.
                graph_hundred_component_at_least_one_none_zero_digit +
                pynutil.insert("\""),
                weight=0.0001,
            )
            default_fraction_graph = (
                decimal_delete_last_zeros
                | decimal_with_quantity) @ graph_decimal_final
        # Amounts with no integer part: optional "0", then ".", both deleted.
        decimal_graph_with_minor_curr |= (
            pynini.closure(pynutil.delete("0"), 0, 1) +
            pynutil.delete(".") + fractional_plus_min)
        # Consume the leading symbol and tag the token as order-preserving.
        decimal_graph_with_minor_curr = (pynutil.delete(curr_symbol) +
                                         decimal_graph_with_minor_curr +
                                         preserve_order)

        decimal_graph_with_minor = (
            decimal_graph_with_minor_curr
            if decimal_graph_with_minor is None else pynini.union(
                decimal_graph_with_minor,
                decimal_graph_with_minor_curr).optimize())

        if not deterministic:
            integer_graph_reordered_curr = (pynutil.delete(curr_symbol) +
                                            integer_plus_maj +
                                            preserve_order).optimize()

            integer_graph_reordered = (
                integer_graph_reordered_curr
                if integer_graph_reordered is None else pynini.union(
                    integer_graph_reordered,
                    integer_graph_reordered_curr).optimize())
            decimal_default_reordered_curr = (
                pynutil.delete(curr_symbol) + default_fraction_graph +
                insert_space +
                pynutil.insert(curr_symbol) @ graph_maj_plural)

            decimal_default_reordered = (
                decimal_default_reordered_curr
                if decimal_default_reordered is None else pynini.union(
                    decimal_default_reordered,
                    decimal_default_reordered_curr)).optimize()

    # weight for SH
    final_graph |= pynutil.add_weight(decimal_graph_with_minor, -0.001)

    if not deterministic:
        final_graph |= integer_graph_reordered | decimal_default_reordered
        # to handle "$2.00" cases
        final_graph |= pynini.compose(
            NEMO_SIGMA + pynutil.delete(".") +
            pynini.closure(pynutil.delete("0"), 1),
            integer_graph_reordered)
    final_graph = self.add_tokens(final_graph.optimize())
    self.fst = final_graph.optimize()
def __init__(self, number_names: dict, deterministic: bool = True):
    """Build the Russian time classifier and store it in ``self.fst``.

    Three readings of ``HH:MM`` are unioned into ``self.final_graph``:
    the order-preserving "hours (and) minutes" form, the "minutes of the
    next hour" form and the "without N minutes, next hour" form.

    Published attributes: ``minutes``, ``graph_preserve_order``,
    ``increment_hour_ordinal``, ``mins_to_h``, ``increment_hour_cardinal``,
    ``final_graph`` and ``fst``.

    Args:
        number_names: mapping of number graphs; only the
            ``'cardinal_names_nominative'`` entry is used here.
        deterministic: forwarded to the base class.
    """
    super().__init__(name="time", kind="classify", deterministic=deterministic)

    increment_hour_ordinal = pynini.string_file(get_abs_path("data/time/increment_hour_ordinal.tsv"))
    increment_hour_cardinal = pynini.string_file(get_abs_path("data/time/increment_hour_cardinal.tsv"))
    convert_hour = pynini.string_file(get_abs_path("data/time/time_convert.tsv"))

    # An optional leading "0" is dropped before the number is read out.
    number = pynini.closure(pynini.cross("0", ""), 0, 1) + number_names['cardinal_names_nominative']

    hour_options = pynini.project(increment_hour_ordinal, "input")
    hour_options = hour_options | pynini.project(convert_hour, "output")

    # The noun form depends on the number: 1/21 -> "час",
    # 2-4/22-23 -> "часа", everything else -> "часов".
    hour_exception_ends_with_one = pynini.union(*["01", "21"])
    hour_exception_ends_rest = pynini.union(*["02", "03", "04", "22", "23"])
    hour_other = (
        pynini.difference(
            hour_options, pynini.union(hour_exception_ends_with_one, hour_exception_ends_rest)
        )
    ).optimize()

    hour = hour_exception_ends_with_one @ number + pynutil.insert(" час")
    hour |= hour_exception_ends_rest @ number + pynutil.insert(" часа")
    hour |= hour_other @ number + pynutil.insert(" часов")

    optional_and = pynini.closure(pynutil.insert("и "), 0, 1)
    digits = pynini.union(*[str(x) for x in range(10)])
    mins_start = pynini.union(*"012345")
    mins_options = mins_start + digits

    # Minutes ending in 1 take "минута" -- except 11, which (like 12-14
    # below) takes the general form "минут". Removing "11" here makes it
    # fall into mins_other.
    mins_exception_ends_with_one = pynini.difference(
        mins_start + pynini.accep("1"), pynini.accep("11")
    )
    # Minutes ending in 2-4 take "минуты" -- except 12, 13, 14.
    mins_exception_ends_rest = pynini.difference(
        mins_start + pynini.union(*"234"), pynini.union(*["12", "13", "14"])
    )
    mins_other = pynini.difference(
        mins_options, pynini.union(mins_exception_ends_with_one, mins_exception_ends_rest)
    )

    minutes = mins_exception_ends_with_one @ number + pynutil.insert(" минута")
    minutes |= mins_exception_ends_rest @ number + pynutil.insert(" минуты")
    minutes |= mins_other @ number + pynutil.insert(" минут")
    self.minutes = minutes.optimize()

    # 17:15 -> "семнадцать часов и пятнадцать минут"
    hm = (
        pynutil.insert("hours: \"")
        + hour.optimize()
        + pynutil.insert("\"")
        + (pynini.cross(":", " ") + pynutil.insert("minutes: \"") + optional_and + minutes.optimize())
        + pynutil.insert("\"")
        + pynutil.insert(" preserve_order: true")
    )
    # Whole hours: the ":00" suffix is deleted and no minutes field is emitted.
    h = pynutil.insert("hours: \"") + hour + pynutil.insert("\"") + pynutil.delete(":00")
    self.graph_preserve_order = (hm | h).optimize()

    # 17:15 -> "пятнадцать минут шестого"
    # Requires permutations for the correct verbalization
    self.increment_hour_ordinal = pynini.compose(hour_options, increment_hour_ordinal).optimize()
    m_next_h = (
        pynutil.insert("hours: \"")
        + self.increment_hour_ordinal
        + pynutil.insert("\"")
        + pynini.cross(":", " ")
        + pynutil.insert("minutes: \"")
        + minutes
        + pynutil.insert("\"")
    )

    # 17:45 -> "без пятнадцати минут шесть"
    # Requires permutations for the correct verbalization
    self.mins_to_h = pynini.string_file(get_abs_path("data/time/minutes_to_hour.tsv")).optimize()
    self.increment_hour_cardinal = pynini.compose(hour_options, increment_hour_cardinal).optimize()
    m_to_h = (
        pynutil.insert("hours: \"")
        + self.increment_hour_cardinal
        + pynutil.insert("\"")
        + pynini.cross(":", " ")
        + pynutil.insert("minutes: \"без ")
        + self.mins_to_h
        + pynutil.insert("\"")
    )

    self.final_graph = m_next_h | self.graph_preserve_order | m_to_h
    self.fst = self.add_tokens(self.final_graph)
    self.fst = self.fst.optimize()
def __init__(self, deterministic: bool = True):
    """Build the cardinal classifier and store it in ``self.fst``.

    Published attributes: ``graph_hundred_component_at_least_one_none_zero_digit``,
    ``graph``, ``single_digits_graph``, ``range_graph`` (non-deterministic
    mode only) and ``fst``.

    Args:
        deterministic: when False, extra readings are added ("oh"/"o" for
            "0", digit-by-digit readings, ranges).

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source. In particular confirm whether
    ``remove_leading_zeros`` belongs inside the second
    ``if not deterministic:`` block.
    """
    super().__init__(name="cardinal",
                     kind="classify",
                     deterministic=deterministic)

    graph = pynini.Far(
        get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()
    # Input side: two or three digits, or a single digit other than "0".
    self.graph_hundred_component_at_least_one_none_zero_digit = (
        pynini.closure(NEMO_DIGIT, 2, 3)
        | pynini.difference(NEMO_DIGIT, pynini.accep("0"))) @ graph
    # 1-3 digits followed by any number of three-digit groups, each
    # optionally preceded by a "," that is deleted.
    self.graph = (pynini.closure(NEMO_DIGIT, 1, 3) + pynini.closure(
        pynini.closure(pynutil.delete(","), 0, 1) + NEMO_DIGIT +
        NEMO_DIGIT + NEMO_DIGIT)) @ graph

    graph_digit = pynini.string_file(
        get_abs_path("data/numbers/digit.tsv"))
    graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))

    # Digit-by-digit reading, words separated by an inserted space.
    single_digits_graph = pynini.invert(graph_digit | graph_zero)
    self.single_digits_graph = single_digits_graph + pynini.closure(
        pynutil.insert(" ") + single_digits_graph)

    if not deterministic:
        # "0" may additionally be read as "oh" or "o".
        single_digits_graph = (pynini.invert(graph_digit | graph_zero)
                               | pynini.cross("0", "oh")
                               | pynini.cross("0", "o"))
        self.single_digits_graph = single_digits_graph + pynini.closure(
            pynutil.insert(" ") + single_digits_graph)

        # One to three leading digit runs, then one or more ",ddd" groups
        # read digit by digit (the comma is deleted).
        single_digits_graph_with_commas = pynini.closure(
            self.single_digits_graph + pynutil.insert(" "), 1,
            3) + pynini.closure(
                pynutil.delete(",") + single_digits_graph +
                pynutil.insert(" ") + single_digits_graph +
                pynutil.insert(" ") + single_digits_graph,
                1,
            )
        self.graph = (self.graph | self.single_digits_graph
                      | get_hundreds_graph()
                      | pynutil.add_weight(single_digits_graph_with_commas,
                                           0.001))

        # "<number>-<number>" read with " to " (optionally prefixed by
        # "from ") or with a plain space.
        self.range_graph = (
            pynini.closure(pynutil.insert("from "), 0, 1) + self.graph +
            (pynini.cross("-", " to ") | pynini.cross("-", " ")) +
            self.graph)

        # "<number>x<number>" / "<number> x <number>" read with " by ".
        self.range_graph |= self.graph + (pynini.cross(
            "x", " by ") | pynini.cross(" x ", " by ")) + self.graph
        self.range_graph = self.range_graph.optimize()

    optional_minus_graph = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0,
        1)

    # Five or more digits read digit by digit; the negative weight makes
    # this path cheaper than the alternatives.
    long_numbers = pynini.compose(NEMO_DIGIT**(5, ...),
                                  self.single_digits_graph).optimize()
    final_graph = self.graph | self.get_serial_graph(
    ) | pynutil.add_weight(long_numbers, -0.001)

    if not deterministic:
        final_graph |= self.range_graph

    # One or more leading "0" are deleted, the remainder is read by self.graph.
    remove_leading_zeros = pynini.closure(
        pynutil.delete("0"), 1) + pynini.compose(
            pynini.closure(NEMO_DIGIT, 1), self.graph)
    final_graph |= remove_leading_zeros

    final_graph = optional_minus_graph + pynutil.insert(
        "integer: \"") + final_graph + pynutil.insert("\"")
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
"""Portuguese g2p rules.""" import pynini from pynini.lib import rewrite # Gets all characters in language English chars = ([chr(i) for i in range(65, 90)] + [chr(i) for i in range(97, 123)] + ["ʎ", "ʃ", "ɲ", "ç", "á", "ʁ", "ɾ", "ʒ", "ch", "lh", "nh", "ss"]) SIGMA_STAR = pynini.string_map(chars).closure() # Portugese rule set given G2P = (pynini.cdrewrite( pynini.union( pynini.cross("ch", "ʃ"), pynini.cross("lh", "ʎ"), pynini.cross("nh", "ɲ"), ), "", "", SIGMA_STAR, ) @ pynini.cdrewrite(pynini.cross( "h", ""), "", "", SIGMA_STAR) @ pynini.cdrewrite( pynini.cross("o", "u"), "", pynini.union("[EOS]", pynini.accep("s[EOS]"), "r"), SIGMA_STAR, ) @ pynini.cdrewrite( pynini.cross("e", "i"), "", pynini.union("[EOS]", pynini.accep("s[EOS]")), SIGMA_STAR,
def get_address_graph(self, cardinal):
    """
    Finite state transducer for street addresses, e.g.:

    2788 San Tomas Expy, Santa Clara, CA 95051 ->
        units: "address" cardinal { integer: "two seven eight eight San Tomas Expressway Santa Clara California nine five zero five one" } preserve_order: true

    The returned graph accepts a house number, an optional direction letter,
    street words ending in an entry of data/address/address_word.tsv, and then
    either an optional city / state / zip tail or an optional trailing ".".

    Args:
        cardinal: cardinal tagger; its ``graph``,
            ``graph_hundred_component_at_least_one_none_zero_digit`` and
            ``single_digits_graph`` attributes are used.

    Returns:
        the (unoptimized) address transducer
    """
    space = pynini.accep(NEMO_SPACE)
    comma = pynini.accep(",")
    hundred_component = (
        cardinal.graph_hundred_component_at_least_one_none_zero_digit)

    # Ordinal street names: tag the ordinal, then verbalize it.
    ordinal_verbalizer = OrdinalVerbalizer().graph
    ordinal_tagger = OrdinalTagger(cardinal=cardinal).graph
    tagged_ordinal = (pynutil.insert("integer: \"") + ordinal_tagger +
                      pynutil.insert("\""))
    ordinal_num = pynini.compose(tagged_ordinal, ordinal_verbalizer)

    # House number: three or four digits are read as a one/two digit part
    # followed by a two digit part (a "0" in the latter may be read "zero").
    leading_part = NEMO_DIGIT**(1, 2) @ hundred_component
    trailing_part = NEMO_DIGIT**2 @ (
        pynini.closure(pynini.cross("0", "zero "), 0, 1) + hundred_component)
    address_num = leading_part + (insert_space + trailing_part)
    address_num = pynini.compose(NEMO_DIGIT**(3, 4), address_num)
    # to handle the rest of the numbers
    address_num = plurals._priority_union(address_num, cardinal.graph,
                                          NEMO_SIGMA)

    # Optional direction: " E" / " S" / " W" / " N", optionally with a ".".
    direction_letter = pynini.union(
        pynini.cross("E", "East"),
        pynini.cross("S", "South"),
        pynini.cross("W", "West"),
        pynini.cross("N", "North"),
    )
    direction = direction_letter + pynini.closure(pynutil.delete("."), 0, 1)
    direction = pynini.closure(space + direction, 0, 1)

    # Street name: an optional ordinal or a capitalized word, any number of
    # further capitalized words, then a known address word.
    street_suffix = get_formats(get_abs_path("data/address/address_word.tsv"))
    first_word = (pynini.closure(ordinal_num, 0, 1)
                  | NEMO_UPPER + pynini.closure(NEMO_ALPHA, 1))
    more_words = pynini.closure(NEMO_UPPER + pynini.closure(NEMO_ALPHA) +
                                NEMO_SPACE)
    address_words = space + first_word + NEMO_SPACE + more_words + street_suffix

    # Optional ", City Name".
    city_name = pynini.closure(NEMO_ALPHA | space, 1)
    city = pynini.closure(comma + space + city_name, 0, 1)

    # Optional ", ST"; every label is also offered with a "." after its
    # first character.
    states = load_labels(get_abs_path("data/address/state.tsv"))
    states.extend([(name, f"{abbr[0]}.{abbr[1:]}") for name, abbr in states])
    state = pynini.invert(pynini.string_map(states))
    state = pynini.closure(comma + space + state, 0, 1)

    # Optional five digit zip code, read digit by digit.
    zip_digits = pynini.compose(NEMO_DIGIT**5, cardinal.single_digits_graph)
    zip_code = pynini.closure(
        pynini.closure(comma, 0, 1) + space + zip_digits,
        0,
        1,
    )

    street = address_num + direction + address_words
    with_locality = street + pynini.closure(city + state + zip_code, 0, 1)
    with_final_period = street + pynini.closure(pynini.cross(".", ""), 0, 1)
    return pynini.union(with_locality, with_final_period)
def __init__(self, syms):
    """Build the inflection-class transducers.

    Every string below is compiled with ``syms.alphabet`` as the token type.
    The transducers kept on the instance are the invariant / gradable
    adjective classes, the noun classes and the regular verb class; at the
    end the inflection cross and the inflection filter are constructed.

    Args:
        syms: symbol inventory; stored on the instance, its ``alphabet``
            attribute is used here.
    """
    self.__syms = syms

    def attach(analysis, surface, tail):
        # Rewrite `analysis` into `surface`, then continue with `tail`.
        return pynini.concat(pynini.cross(analysis, surface), tail)

    def any_of(rows, tail, prefix=""):
        # Union of `attach` over (analysis, surface) rows sharing one tail.
        return pynini.union(
            *[attach(prefix + analysis, surface, tail)
              for analysis, surface in rows])

    # Adjectival ending paradigm, shared by the plain adjective endings and
    # the nominalized ones (the latter prefixed with "<+NN> ").
    adjectival_endings = (
        ("<Masc> <Nom> <Sg> <St/Mix>", "e r"),
        ("<Masc> <Nom> <Sg> <Sw>", "e"),
        ("<Masc> <Gen> <Sg>", "e n"),
        ("<Masc> <Dat> <Sg> <St>", "e m"),
        ("<Masc> <Dat> <Sg> <Sw/Mix>", "e n"),
        ("<Masc> <Akk> <Sg>", "e n"),
        ("<Fem> <Nom> <Sg>", "e"),
        ("<Fem> <Gen> <Sg> <St>", "e r"),
        ("<Fem> <Gen> <Sg> <Sw/Mix>", "e n"),
        ("<Fem> <Dat> <Sg> <St>", "e r"),
        ("<Fem> <Dat> <Sg> <Sw/Mix>", "e n"),
        ("<Fem> <Akk> <Sg>", "e"),
        ("<Neut> <Nom> <Sg> <St/Mix>", "e s"),
        ("<Neut> <Nom> <Sg> <Sw>", "e"),
        ("<Neut> <Gen> <Sg>", "e n"),
        ("<Neut> <Dat> <Sg> <St>", "e m"),
        ("<Neut> <Dat> <Sg> <Sw/Mix>", "e n"),
        ("<Neut> <Akk> <Sg> <St/Mix>", "e s"),
        ("<Neut> <Akk> <Sg> <Sw>", "e"),
        ("<NoGend> <Nom> <Pl> <St>", "e"),
        ("<NoGend> <Nom> <Pl> <Sw/Mix>", "e n"),
        ("<NoGend> <Gen> <Pl> <Sw/Mix>", "e n"),
        ("<NoGend> <Gen> <Pl> <St>", "e r"),
        ("<NoGend> <Dat> <Pl>", "e n"),
        ("<NoGend> <Akk> <Pl> <Sw/Mix>", "e n"),
        ("<NoGend> <Akk> <Pl> <St>", "e"),
    )

    # Person/number slots with their regular past-tense endings; indicative
    # and subjunctive use the same endings.
    past_endings = (
        ("<1> <Sg>", "<DEL-S> t e"),
        ("<2> <Sg>", "<DEL-S> t e s t"),
        ("<3> <Sg>", "<DEL-S> t e"),
        ("<1> <Pl>", "<DEL-S> t e n"),
        ("<2> <Pl>", "<DEL-S> t e t"),
        ("<3> <Pl>", "<DEL-S> t e n"),
    )

    with pynini.default_token_type(self.__syms.alphabet):

        #
        # case markers
        #
        fix = pynini.cross("", "<Fix#>")
        adj = pynini.cross("", "<Low#>")
        adj_up = pynini.cross("", "<Up#>")
        n = pynini.cross("", "<Up#>")
        n_low = pynini.cross("", "<Low#>")
        v = pynini.cross("", "<Low#>")
        closed = pynini.cross("", "<Low#>")
        closed_up = pynini.cross("", "<Up#>")

        #
        # abbreviations (built but not stored on the instance)
        #
        abk_ADJ = attach("<^ABK> <+ADJ>", "", adj)
        abk_ADV = attach("<^ABK> <+ADV>", "", closed)
        abk_ART = attach("<^ABK> <+ART>", "", closed)
        abk_DPRO = attach("<^ABK> <+DEMPRO>", "", closed)
        abk_KONJ = attach("<^ABK> <+KONJ>", "", closed)
        abk_NE = attach("<^ABK> <+NE>", "", n)
        abk_NE_Low = attach("<^ABK> <+NE>", "", n_low)
        abk_NN = attach("<^ABK> <+NN>", "", n)
        abk_NN_Low = attach("<^ABK> <+NN>", "", n_low)
        abk_PREP = attach("<^ABK> <+PREP>", "", closed)
        abk_VPPAST = attach("<^ABK> <^VPPAST> <+ADJ>", "", adj)
        abk_VPPRES = attach("<^ABK> <^VPPRES> <+ADJ>", "", adj)

        #
        # adjectives
        #

        # invariant adjectives
        self.__adj0 = attach("<+ADJ> <Invar>", "", adj)
        self.__adj0_up = attach("<+ADJ> <Invar>", "", adj_up)

        # inflectional endings, plain and nominalized
        adj_flex_suff = any_of(adjectival_endings, adj).optimize()
        adj_nn_suff = any_of(adjectival_endings, n, prefix="<+NN> ").optimize()

        # positive
        adj_pos = pynini.union(
            attach("<+ADJ> <Pos> <Pred>", "", adj),
            attach("<+ADJ> <Pos> <Adv>", "", adj),
            attach("<+ADJ> <Pos>", "", adj_flex_suff),
        ).optimize()
        adj_pos_attr = attach("<+ADJ> <Pos>", "<FB>", adj_flex_suff).optimize()
        adj_pos_pred = attach("<+ADJ> <Pos> <Pred>", "", adj)

        # superlative
        adj_sup = pynini.union(
            attach("<+ADJ> <Sup> <Pred>", "s t e n", adj),
            attach("<+ADJ> <Sup> <Pred>", "s t", adj),
            attach("<+ADJ> <Sup> <Adv>", "s t e n", adj),
            attach("<+ADJ> <Sup>", "s t", adj_flex_suff),
        ).optimize()

        # superlative with e
        adj_sup_e = pynini.union(
            attach("<+ADJ> <Sup> <Pred>", "e s t e n", adj),
            attach("<+ADJ> <Sup> <Pred>", "e s t", adj),
            attach("<+ADJ> <Sup> <Adv>", "e s t e n", adj),
            attach("<+ADJ> <Sup>", "e s t", adj_flex_suff),
        ).optimize()

        # comparative
        adj_comp = pynini.union(
            attach("<+ADJ> <Comp> <Pred>", "e r", adj),
            attach("<+ADJ> <Comp> <Adv>", "e r", adj),
            attach("<+ADJ> <Comp>", "e r", adj_flex_suff),
        ).optimize()

        # adjective classes; each degree is introduced by an inserted marker
        adj_nn = adj_pos_pred

        self.__adj_plus = pynini.union(
            attach("", "<FB>", adj_pos),
            attach("", "<FB>", adj_comp),
            attach("", "<FB>", adj_sup),
        ).optimize()

        self.__adj_plus_e = pynini.union(
            attach("", "<FB>", adj_pos),
            attach("", "<FB>", adj_comp),
            attach("", "<FB>", adj_sup_e),
        ).optimize()

        adj_pos_sup = pynini.union(
            attach("", "<FB>", adj_pos_attr),
            attach("", "<FB>", adj_sup),
        ).optimize()

        adj_umlaut = pynini.union(
            attach("", "<FB>", adj_pos),
            attach("", "<UL>", adj_comp),
            attach("", "<UL>", adj_sup),
        ).optimize()

        adj_umlaut_e = pynini.union(
            attach("", "<FB>", adj_pos),
            attach("", "<UL>", adj_comp),
            attach("", "<UL> e", adj_sup),
        ).optimize()

        adj_ss_e = pynini.union(
            attach("", "<SS> <FB>", adj_pos),
            attach("", "<SS> <FB>", adj_comp),
            attach("", "<SS> <FB> e", adj_sup),
        ).optimize()

        #
        # nouns
        #

        # atomic endings
        # Frau; Mythos; Chaos
        n_sg_0 = any_of((
            ("<Nom> <Sg>", "<FB>"),
            ("<Gen> <Sg>", "<FB>"),
            ("<Dat> <Sg>", "<FB>"),
            ("<Akk> <Sg>", "<FB>"),
        ), n).optimize()

        # Opa-s, Klima-s
        n_sg_s = any_of((
            ("<Nom> <Sg>", "<FB>"),
            ("<Gen> <Sg>", "<FB> s"),
            ("<Dat> <Sg>", "<FB>"),
            ("<Akk> <Sg>", "<FB>"),
        ), n).optimize()

        # Haus-es, Geist-(e)s
        n_sg_es = any_of((
            ("<Nom> <Sg>", "<FB>"),
            ("<Gen> <Sg>", "<FB> e s <^Gen>"),
            ("<Dat> <Sg>", "<FB>"),
            ("<Dat> <Sg>", "<FB> e"),
            ("<Akk> <Sg>", "<FB>"),
        ), n).optimize()

        # plural with "n" in the dative
        n_pl_0 = any_of((
            ("<Nom> <Pl>", ""),
            ("<Gen> <Pl>", ""),
            ("<Dat> <Pl>", "n"),
            ("<Akk> <Pl>", ""),
        ), n).optimize()

        # plural without any ending
        n_pl_x = any_of((
            ("<Nom> <Pl>", ""),
            ("<Gen> <Pl>", ""),
            ("<Dat> <Pl>", ""),
            ("<Akk> <Pl>", ""),
        ), n).optimize()

        # meta endings: a singular paradigm plus a plural stem ending
        n_es_e = pynini.union(n_sg_es, attach("", "<FB> e", n_pl_0))
        n_es_e_ul = pynini.union(n_sg_es, attach("", "<UL> e", n_pl_0))
        n_es_en = pynini.union(n_sg_es, attach("", "<FB> e n", n_pl_x))
        n_0_en = pynini.union(n_sg_0, attach("", "<FB> e n", n_pl_x))
        n_0_n = pynini.union(n_sg_0, attach("", "<FB> n", n_pl_x))
        n_s_x = pynini.union(n_sg_s, n_pl_x)

        # NMasc_es_e: Tag-(e)s/Tage
        self.__nmasc_es_e = attach("<+NN> <Masc>", "", n_es_e).optimize()

        # NMasc_es_e$: Arzt-(e)s/Ärzte
        self.__nmasc_es_e_ul = attach("<+NN> <Masc>", "", n_es_e_ul).optimize()

        # NMasc_es_en: Fleck-(e)s/Flecken
        self.__nmasc_es_en = attach("<+NN> <Masc>", "", n_es_en).optimize()

        # NFem-Deriv
        self.__nfem_deriv = attach("<+NN> <Fem>", "", n_0_en).optimize()

        # NFem_0_n: Kammer/Kammern
        self.__nfem_0_n = attach("<+NN> <Fem>", "", n_0_n).optimize()

        # NNeut-Dimin: Mäuschen-s/Mäuschen
        self.__nneut_dimin = attach("<+NN> <Neut>", "", n_s_x).optimize()

        # NNeut/Sg_s: Abitur-s/--
        self.__nneut_sg_s = attach("<+NN> <Neut>", "", n_sg_s).optimize()

        #
        # verbs
        #

        # optional clitic 's (bin's), followed by the verb case marker
        v_plus_es = pynini.cross("/ \' s", "\' s").closure(0, 1) + v

        def verb_forms(rows):
            # Union of the (analysis, surface) rows, followed by `v_plus_es`.
            endings = pynini.union(
                *[pynini.cross(analysis, surface)
                  for analysis, surface in rows])
            return pynini.concat(endings, v_plus_es).optimize()

        # (ich) lerne
        v_pres_reg_1 = attach("<+V> <1> <Sg> <Pres> <Ind>", "<FB> e",
                              v_plus_es).optimize()

        # (du) lernst
        v_pres_reg_2 = attach("<+V> <2> <Sg> <Pres> <Ind>", "<DEL-S> s t",
                              v_plus_es).optimize()

        # (er/sie/es) lernt
        v_pres_reg_3 = attach("<+V> <3> <Sg> <Pres> <Ind>", "<DEL-S> t",
                              v_plus_es).optimize()

        # present indicative plural: (wir) lernen, (ihr) lernt, (sie) lernen
        v_pres_pl_ind = verb_forms((
            ("<+V> <1> <Pl> <Pres> <Ind>", "<FB> e n"),
            ("<+V> <2> <Pl> <Pres> <Ind>", "<DEL-S> t"),
            ("<+V> <3> <Pl> <Pres> <Ind>", "<FB> e n"),
        ))

        # present subjunctive, all persons
        v_pres_subj = verb_forms((
            ("<+V> <1> <Sg> <Pres> <Konj>", "<FB> e"),
            ("<+V> <2> <Sg> <Pres> <Konj>", "<FB> e s t"),
            ("<+V> <3> <Sg> <Pres> <Konj>", "<FB> e"),
            ("<+V> <1> <Pl> <Pres> <Konj>", "<FB> e n"),
            ("<+V> <2> <Pl> <Pres> <Konj>", "<FB> e t"),
            ("<+V> <3> <Pl> <Pres> <Konj>", "<FB> e n"),
        ))

        # past indicative, all persons: lernte, lerntest, ...
        v_past_ind_reg = verb_forms(
            [("<+V> " + slot + " <Past> <Ind>", surface)
             for slot, surface in past_endings])

        # past subjunctive, all persons (same endings as the indicative)
        v_past_subj_reg = verb_forms(
            [("<+V> " + slot + " <Past> <Konj>", surface)
             for slot, surface in past_endings])

        # imperative plural: kommt, schaut!
        v_imp_pl = attach("<+V> <Imp> <Pl>", "<DEL-S> t <^imp>",
                          v_plus_es).optimize()

        # imperative singular
        v_imp_sg = attach("<+V> <Imp> <Sg>", "<DEL-S> <^imp>",
                          v_plus_es).optimize()

        # infinitive (with or without "zu"), or conversion to a neuter noun
        # SMOR: investigate Lernen<+NN>
        v_inf = pynini.union(
            pynini.union(pynini.cross("<+V> <Inf>", ""),
                         pynini.cross("<+V> <Inf> <zu>", "<^zz>")) + v,
            pynini.cross("<V> <CONV>", "") + self.__nneut_sg_s,
        )

        # present participle
        # SMOR: investigate lernendes<+ADJ>
        v_ppres = pynini.union(
            pynini.cross("<+V> <PPres>", ""),
            pynini.cross("<+V> <PPres> <zu>", "<^zz>")) + v

        # past participle
        # SMOR: investigate gelerntes<+ADJ>
        v_ppast = pynini.cross("<+V> <PPast>", "<^pp>") + v

        # lernend
        v_inf_plus_ppres = pynini.union(
            v_inf, attach("", "d", v_ppres)).optimize()

        # lernen
        v_inf_stem = attach("", "<FB> e n", v_inf_plus_ppres).optimize()

        # gelernt
        v_pp_t = attach("", "<DEL-S> t", v_ppast).optimize()

        # meta endings
        v_flex_pres_1 = pynini.union(v_pres_reg_1, v_pres_pl_ind, v_pres_subj,
                                     v_imp_pl, v_inf_stem).optimize()
        v_flex_pres_reg = pynini.union(v_flex_pres_1, v_pres_reg_2,
                                       v_pres_reg_3, v_imp_sg).optimize()
        v_flex_reg = pynini.union(v_flex_pres_reg, v_past_ind_reg,
                                  v_past_subj_reg, v_pp_t).optimize()

        # VVReg: lernen -- strip the citation-form "e n", then inflect
        self.__vv_reg = attach("e n", "", v_flex_reg).optimize()

        #
        # building the inflection cross
        #
        self.__inflection = self.__construct_inflection()

        #
        # definition of a filter which enforces the correct inflection
        #
        self.__inflection_filter = self.__construct_inflection_filter()
def __init__(self):
    """Build the telephone classifier and store it in ``self.fst``.

    A graph over written numbers ("ddd-ddd-dddd" or "(ddd)[-] ddd-dddd") is
    built first, reading each digit through the inverted digit table (with
    "o", "oh" or "zero" for "0") and deleting the punctuation. That graph is
    then inverted and wrapped as: number_part: "<...>"
    """
    super().__init__(name="telephone", kind="classify")

    drop_space = pynutil.delete(" ")
    drop_dash = pynutil.delete("-")
    # Space inserted between the three components of the number.
    component_gap = pynutil.insert(" ")

    digit_names = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/digit.tsv"))).optimize()
    digit = digit_names | pynini.cross("0", pynini.union("o", "oh", "zero"))

    # Groups of three and four digits, space separated on the output side.
    three_digits = pynini.closure(digit + insert_space, 2, 2) + digit
    four_digits = pynini.closure(digit + insert_space, 3, 3) + digit

    # First group: either "ddd-" or "(ddd)" with an optional "-" and a space.
    dashed_prefix = three_digits + drop_dash
    bracketed_prefix = (pynutil.delete("(") + three_digits +
                        pynutil.delete(")") +
                        pynini.closure(pynutil.delete("-"), 0, 1) + drop_space)

    number_part = ((dashed_prefix | bracketed_prefix) + component_gap +
                   three_digits + drop_dash + component_gap + four_digits)
    number_part = (pynutil.insert("number_part: \"") +
                   pynini.invert(number_part) + pynutil.insert("\""))

    self.fst = self.add_tokens(number_part).optimize()
def __construct_inflection_filter(self):
    '''
    Define a filter which enforces the correct inflection

    The filter deletes two consecutive occurrences of the same inflection
    class marker and then passes any sequence of characters and the listed
    boundary / trigger symbols through unchanged.
    '''
    class_markers = (
        "<Adj0>",
        "<Adj0-Up>",
        "<Adj+>",
        "<Adj+e>",
        "<NMasc_es_e>",
        "<NMasc_es_$e>",
        "<NMasc_es_en>",
        "<NFem-Deriv>",
        "<NFem_0_n>",
        "<NNeut-Dimin>",
        "<NNeut/Sg_s>",
        "<VVReg>",
    )
    passthrough_symbols = [
        "<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<UL>", "<SS>", "<FB>",
        "<DEL-S>", "<ge>", "<no-ge>", "<^imp>", "<^zz>", "<^pp>", "<^Ax>",
        "<^pl>", "<^Gen>", "<^Del>", "<Fix#>", "<Low#>", "<Up#>"
    ]

    with pynini.default_token_type(self.__syms.alphabet):
        # Everything that may follow the marker pair.
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(passthrough_symbols),
        ).project("input").closure()

        # One alternative per class: the marker, deleted twice in a row.
        matched_pairs = [
            pynini.concat(pynini.cross(marker, ""), pynini.cross(marker, ""))
            for marker in class_markers
        ]

        return pynini.concat(pynini.union(*matched_pairs),
                             alphabet).optimize()
def __init__(self):
    """Build the Vietnamese cardinal classifier and store it in ``self.fst``.

    Spoken number words are mapped to digit strings. Also sets
    ``self.graph_hundred_component_at_least_one_none_zero_digit``,
    ``self.graph_no_exception`` (the full number graph) and ``self.graph``
    (the same graph minus the inputs of the digit and zero tables).
    """
    super().__init__(name="cardinal", kind="classify")

    graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
    graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))

    # Special word forms and magnitude words.
    graph_one = pynini.cross("mốt", "1")
    graph_four = pynini.cross("tư", "4")
    graph_five = pynini.cross("lăm", "5")
    graph_half = pynini.cross("rưỡi", "5")
    graph_hundred = pynini.cross("trăm", "")
    graph_ten = pynini.cross("mươi", "")
    zero = pynini.cross(pynini.union("linh", "lẻ"), "0")
    delete_thousand_word = pynutil.delete(pynini.union("nghìn", "ngàn"))

    optional_ten = pynini.closure(delete_space + graph_ten, 0, 1)

    # Final digit after a tens word: any digit word except "năm", plus the
    # special forms above.
    last_digit_exception = pynini.project(pynini.cross("năm", "5"), "input")
    regular_last_digit = (pynini.project(graph_digit, "input") -
                          last_digit_exception.arcsort()) @ graph_digit
    last_digit = pynini.union(regular_last_digit, graph_one, graph_four,
                              graph_five)

    digit_after_zero = zero + delete_space + (graph_digit | graph_four)

    # Three digit component that starts with an explicit hundreds word.
    hundreds_prefix = ((graph_digit | graph_zero) + delete_space +
                       graph_hundred + delete_space)
    after_hundreds = pynini.union(
        graph_teen,
        (graph_half | graph_four | graph_one) + pynutil.insert("0"),
        graph_ties + optional_ten +
        ((delete_space + last_digit) | pynutil.insert("0")),
        digit_after_zero,
        pynutil.insert("00"),
    )
    # Three digit component without a hundreds word: a "0" is inserted.
    without_hundreds = (pynutil.insert("0") + delete_space + pynini.union(
        graph_teen,
        graph_ties + optional_ten + delete_space + last_digit,
        graph_ties + delete_space + graph_ten + pynutil.insert("0"),
        digit_after_zero,
    ))
    graph_hundred_ties_component = (hundreds_prefix +
                                    after_hundreds) | without_hundreds

    graph_hundred_component = graph_hundred_ties_component | (
        pynutil.insert("00") + delete_space + graph_digit)

    # Keep only outputs that contain at least one non-zero digit.
    has_non_zero_digit = (pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") +
                          pynini.closure(NEMO_DIGIT))
    hundreds_non_zero = graph_hundred_component @ has_non_zero_digit
    self.graph_hundred_component_at_least_one_none_zero_digit = (
        hundreds_non_zero)

    graph_hundred_ties_zero = graph_hundred_ties_component | pynutil.insert(
        "000")

    def magnitude(amount, delete_word, filler):
        # `amount` followed by a deleted magnitude word, or weighted zeros.
        return pynini.union(
            amount + delete_space + delete_word,
            pynutil.insert(filler, weight=0.1),
        )

    graph_thousands = magnitude(hundreds_non_zero, delete_thousand_word,
                                "000")
    graph_ten_thousand = magnitude(hundreds_non_zero, pynutil.delete("vạn"),
                                   "0000")
    graph_ten_thousand_suffix = magnitude(graph_digit, delete_thousand_word,
                                          "0")
    graph_million = magnitude(hundreds_non_zero, pynutil.delete("triệu"),
                              "000")
    graph_billion = magnitude(hundreds_non_zero,
                              pynutil.delete(pynini.union("tỉ", "tỷ")), "000")

    billions_to_units = (graph_billion + delete_space + graph_million +
                         delete_space + graph_thousands + delete_space +
                         graph_hundred_ties_zero)
    ten_thousands_to_units = (graph_ten_thousand + delete_space +
                              graph_ten_thousand_suffix + delete_space +
                              graph_hundred_ties_zero)
    # Thousands followed by a single word standing for the hundreds digit.
    thousands_short_form = (
        hundreds_non_zero + delete_space + delete_thousand_word +
        delete_space + (((last_digit | graph_half) + pynutil.insert("00"))
                        | graph_hundred_ties_zero))

    graph = pynini.union(
        billions_to_units,
        ten_thousands_to_units,
        thousands_short_form,
        graph_digit,
        graph_zero,
    )

    # Strip leading zeros from the digit string; a lone "0" is kept.
    strip_leading_zeros = pynini.union(
        pynutil.delete(pynini.closure("0")) +
        pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT),
        "0",
    )
    graph = graph @ strip_leading_zeros

    # don't convert cardinals from zero to nine inclusive
    graph_exception = pynini.project(pynini.union(graph_digit, graph_zero),
                                     "input")

    self.graph_no_exception = graph
    self.graph = (pynini.project(graph, "input") -
                  graph_exception.arcsort()) @ graph

    # Optional leading "âm" / "trừ" becomes the field: negative: "-"
    optional_minus_graph = pynini.closure(
        pynutil.insert("negative: ") +
        pynini.cross(pynini.union("âm", "trừ"), '"-"') + NEMO_SPACE,
        0,
        1,
    )

    tagged = (optional_minus_graph + pynutil.insert('integer: "') +
              self.graph + pynutil.insert('"'))
    self.fst = self.add_tokens(tagged).optimize()
def _get_digits_graph():
    """Return an optimized graph for a zero followed by one more digit.

    Two spellings are accepted: "linh" / "lẻ" (mapped to "0") followed by a
    digit word or "tư" (mapped to "4"), or an entry of ``graph_zero``
    followed by a digit word. The separating space is deleted.
    """
    linking_zero = pynini.cross(
        pynini.accep("linh") | pynini.accep("lẻ"), "0")
    digit_or_four = graph_digit | pynini.cross("tư", "4")

    linked_form = linking_zero + delete_space + digit_or_four
    plain_form = graph_zero + delete_space + graph_digit
    return pynini.union(linked_form, plain_form).optimize()
def __init__(self, cardinal: GraphFst, deterministic: bool = True):
    """Build the time classifier and store it in ``self.fst``.

    Accepted shapes are "H.MM[.SS] [h]", "H h MM min [SS s]",
    "H.MM[.SS] <suffix>" (12 hour clock only) and hour-only forms that carry
    either an "h" or a day suffix; each may be followed by a time zone.
    When ``deterministic`` is false, shifted hour/minute readings tagged with
    a ``style`` field are added as well.

    Args:
        cardinal: cardinal tagger whose ``graph`` verbalizes the numbers
        deterministic: forwarded to the base class; when true, the output is
            marked with `` preserve_order: true``
    """
    super().__init__(name="time", kind="classify", deterministic=deterministic)

    def quoted(label, content):
        # Wrap `content` as: label: "<content>"
        return pynutil.insert(label + ": \"") + content + pynutil.insert("\"")

    delimiter = pynutil.delete(pynini.union(".", ":"))
    # Replace any run of input spaces by a single output space.
    gap = delete_space + insert_space

    # "un" / "ún" in the cardinal output is rewritten to "una".
    one = pynini.string_map([("un", "una"), ("ún", "una")])
    cardinal_graph = cardinal.graph @ pynini.cdrewrite(one, "", "", NEMO_SIGMA)

    day_suffix = gap + quoted("suffix", suffix)
    hora_suffix = gap + pynutil.delete("h")
    minute_suffix = gap + pynutil.delete("min")
    second_suffix = gap + pynutil.delete("s")

    # Can see both systems. Twelve hour requires am/pm for ambiguity resolution
    hour_labels_24 = list(map(str, range(0, 25)))
    hour_labels_12 = list(map(str, range(1, 13)))
    minute_labels_single = list(map(str, range(1, 10)))
    minute_labels_double = list(map(str, range(10, 60)))

    # Drop a leading "0" from a two digit number.
    strip_leading_zero = (
        pynini.closure(pynutil.delete("0") | (NEMO_DIGIT - "0"), 0, 1) +
        NEMO_DIGIT)
    one_or_two_digits = pynini.closure(NEMO_DIGIT, 1, 2)

    graph_24 = (one_or_two_digits @ strip_leading_zero
                @ pynini.union(*hour_labels_24))
    graph_12 = (one_or_two_digits @ strip_leading_zero
                @ pynini.union(*hour_labels_12))

    minute_digits = pynini.union(
        strip_leading_zero @ pynini.union(*minute_labels_single),
        pynini.union(*minute_labels_double),
    )
    graph_minute = minute_digits @ cardinal_graph

    hour_24 = quoted("hours", graph_24 @ cardinal_graph)
    hour_12 = quoted("hours", graph_12 @ cardinal_graph)
    minutes = quoted("minutes", graph_minute)
    seconds = quoted("seconds", graph_minute)

    time_zone = pynini.closure(
        gap + quoted("zone", time_zone_graph),
        0,
        1,
    )

    # "00" minutes are dropped; "00" seconds become an explicit zero field.
    minute_part = pynutil.delete("00") | (insert_space + minutes)
    second_part = (pynini.cross("00", " seconds: \"0\"")
                   | (insert_space + seconds))
    delimited_seconds = pynini.closure(delimiter + second_part, 0, 1)

    # 02.30 h / 2.30.35 h (the "h" is optional: 2.30 is valid if unambiguous)
    dotted_24 = (hour_24 + delimiter + minute_part + delimited_seconds +
                 pynini.closure(hora_suffix, 0, 1) + time_zone)

    # 2 h 30 min, optionally with seconds
    spelled_24 = (hour_24 + hora_suffix + delete_space + minute_part +
                  minute_suffix + pynini.closure(
                      delete_space + second_part + second_suffix,
                      0,
                      1,
                  ) + time_zone)

    # 2.30 a. m. / 2.30.35 a. m. (Only for 12 hour clock)
    dotted_12 = (hour_12 + delimiter + minute_part + delimited_seconds +
                 day_suffix + time_zone)

    graph_hm = pynini.union(dotted_24, spelled_24, dotted_12)

    # Should always have a time indicator, else we'll pass to cardinals
    graph_h = pynini.union(hour_24 + hora_suffix,
                           hour_12 + day_suffix) + time_zone

    if not deterministic:
        # This includes alternate vocalization (hour menos min, min para
        # hour), here we shift the times and indicate a `style` tag
        hour_shift_24 = pynini.invert(
            pynini.string_file(get_abs_path("data/time/hour_to_24.tsv")))
        hour_shift_12 = pynini.invert(
            pynini.string_file(get_abs_path("data/time/hour_to_12.tsv")))
        minute_shift = pynini.string_file(
            get_abs_path("data/time/minute_to.tsv"))

        hour_to_24 = quoted("hours", graph_24 @ hour_shift_24 @ cardinal_graph)
        hour_to_12 = quoted("hours", graph_12 @ hour_shift_12 @ cardinal_graph)
        minutes_to = quoted("minutes",
                            minute_digits @ minute_shift @ cardinal_graph)

        style = (pynutil.insert(" style: \"1\"")
                 | pynutil.insert(" style: \"2\""))

        # 02.30 h (omitting seconds since a bit awkward);
        # 2.30 is valid if unambiguous
        shifted_dotted_24 = (hour_to_24 + delimiter + insert_space +
                             minutes_to + pynini.closure(hora_suffix, 0, 1) +
                             time_zone + style)

        # 2 h 30 min
        shifted_spelled_24 = (hour_to_24 + hora_suffix + delete_space +
                              insert_space + minutes_to + minute_suffix +
                              time_zone + style)

        # 2.30 a. m. (Only for 12 hour clock)
        shifted_dotted_12 = (hour_to_12 + delimiter + insert_space +
                             minutes_to + day_suffix + time_zone + style)

        graph_hm = pynini.union(graph_hm, shifted_dotted_24,
                                shifted_spelled_24, shifted_dotted_12)

    final_graph = graph_hm | graph_h
    if deterministic:
        final_graph = final_graph + pynutil.insert(" preserve_order: true")
    final_graph = final_graph.optimize()

    self.fst = self.add_tokens(final_graph).optimize()