def __init__(self, deterministic: bool = True):
    """
    Build the verbalizer graph for "roman" tokens.

    Four serialized layouts are accepted. In every layout the field markup is
    deleted and the quoted integer text is emitted:

        key_cardinal: "KEY" integer: "NUM"          -> KEY NUM
        default_cardinal: "default" integer: "NUM"  -> NUM
        default_ordinal: "default" integer: "NUM"   -> NUM composed with the ordinal suffix graph
        key_the_ordinal: "KEY" integer: "NUM"       -> KEY [the ]NUM as ordinal

    Args:
        deterministic: forwarded unchanged to the base-class constructor.
    """
    super().__init__(name="roman", kind="verbalize", deterministic=deterministic)

    # Quoted payloads: plain text, and the same text pushed through the
    # ordinal suffix graph.
    cardinal_text = pynini.closure(NEMO_NOT_QUOTE)
    ordinal_text = pynini.compose(cardinal_text, OrdinalFst().suffix)

    keyed_cardinal = (
        pynutil.delete("key_cardinal: \"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)  # the key itself is kept in the output
        + pynutil.delete("\"")
        + pynini.accep(" ")
        + pynutil.delete("integer: \"")
        + cardinal_text
        + pynutil.delete("\"")
    ).optimize()

    default_cardinal = (
        pynutil.delete("default_cardinal: \"default\" integer: \"")
        + cardinal_text
        + pynutil.delete("\"")
    ).optimize()

    default_ordinal = (
        pynutil.delete("default_ordinal: \"default\" integer: \"")
        + ordinal_text
        + pynutil.delete("\"")
    ).optimize()

    keyed_the_ordinal = (
        pynutil.delete("key_the_ordinal: \"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
        + pynini.accep(" ")
        + pynutil.delete("integer: \"")
        + pynini.closure(pynutil.insert("the "), 0, 1)  # "the " is optional
        + ordinal_text
        + pynutil.delete("\"")
    ).optimize()

    graph = keyed_cardinal
    graph |= default_cardinal
    graph |= default_ordinal
    graph |= keyed_the_ordinal

    self.fst = self.delete_tokens(graph).optimize()
def __init__(self, tn_decimal, deterministic: bool = False):
    """
    Build the "decimal" classifier by inverting graphs taken from a
    text-normalization decimal object.

    The emitted fields are, in order: an optional `negative: "true"` (when the
    input starts with "минус"), `integer_part: "..."`, `fractional_part: "..."`
    and an optional `quantity: "..."`.

    Args:
        tn_decimal: object exposing `graph_fractional`, `integer_part` and
            `optional_quantity`; each of these graphs is inverted here.
        deterministic: forwarded unchanged to the base-class constructor.
    """
    super().__init__(name="decimal", kind="classify", deterministic=deterministic)

    sign = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("минус", "\"true\"") + delete_extra_space,
        0,
        1,
    )

    # Inverted TN graphs.
    inverted_fraction = pynini.invert(tn_decimal.graph_fractional).optimize()
    inverted_integer = pynini.invert(tn_decimal.integer_part).optimize()
    inverted_quantity = pynini.invert(tn_decimal.optional_quantity).optimize()

    # Wrap each piece in its serialized field markup.
    fraction_field = (
        pynutil.insert("fractional_part: \"") + inverted_fraction + pynutil.insert("\"")
    )
    integer_field = (
        pynutil.insert("integer_part: \"") + inverted_integer + pynutil.insert("\"")
    )
    quantity_field = (
        pynutil.insert("quantity: \"") + inverted_quantity + pynutil.insert("\"")
    )
    optional_quantity = pynini.closure(pynini.accep(NEMO_SPACE) + quantity_field, 0, 1)

    self.final_graph_wo_sign = (
        integer_field + pynini.accep(NEMO_SPACE) + fraction_field + optional_quantity
    )

    tagged = self.add_tokens(sign + self.final_graph_wo_sign)
    self.fst = tagged.optimize()
def load_lexicon(source, symbol_table):
    '''
    Load lexicon entries from source, interpreting them using a given symbol table.

    Every non-blank line of `source` becomes one path of the returned FST.
    A line is tokenized into symbols, where a symbol is either a single
    character or a bracketed multi-character symbol ("<...>"). A token of the
    form "a:b" maps input symbol a onto output symbol b; any other token maps
    onto itself.

    Args:
        source: iterable of text lines.
        symbol_table: symbol table used both as the input/output symbols of the
            result and as the token type for parsing each symbol.

    Returns:
        The union of all entry FSTs.
    '''
    lex = pynini.Fst()
    lex.set_input_symbols(symbol_table)
    lex.set_output_symbols(symbol_table)
    # longest match, prefer complex over simple symbols
    tokenizer = re.compile(r"(<[^>]*>|.)(?::(<[^>]*>|.))?", re.U)
    for line in source:
        line = line.strip()
        if not line:
            continue
        # Start from an FST that accepts only the empty string.
        entry = pynini.Fst()
        entry.set_input_symbols(symbol_table)
        entry.set_output_symbols(symbol_table)
        start = entry.add_state()
        entry.set_start(start)
        entry.set_final(start)
        for upper, lower in tokenizer.findall(line):
            step = pynini.accep(upper, token_type=symbol_table)
            if lower:
                # "a:b" pair. Only the pair itself is appended. The previous
                # code crossed (entry + a) with (entry + b) and appended that
                # to entry, which repeated the already-built prefix.
                step = pynini.cross(
                    step, pynini.accep(lower, token_type=symbol_table))
            entry = pynini.concat(entry, step)
        lex = pynini.union(lex, entry)
    return lex
def __construct_compound_filter(self):
    '''
    Construct the compound filter.

    The filter is the concatenation of
      1. an optional leading "<Initial>" (deleted), "<NoHy>" or "<NoDef>",
      2. one of three word shapes whose final category tag is deleted, and
      3. "<base>" (deleted), an origin feature (deleted) and an inflection class.
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        # Symbols allowed inside a word. Category tags, origin features and
        # "<NoPref>" are deleted on the way through.
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>", "<ge>", "<Ge>"]).project("input"),
            self.__syms.stem_types,
            pynini.cross(self.__syms.categories, ""),
            pynini.cross(self.__syms.origin_features, ""),
            pynini.cross("<NoPref>", "")
        )

        optional_prefix = pynini.union(
            pynini.cross("<Initial>", ""),
            pynini.accep("<NoHy>"),
            pynini.accep("<NoDef>")
        ).closure(0, 1)

        # Shape 1: any symbols followed by one of the listed category tags.
        # NOTE: the method form .closure() works in place on `alphabet`.
        # Taking the closure of a closure does not change the accepted set.
        plain_word = (
            alphabet.closure()
            + pynini.cross(pynini.string_map(["<ABK>", "<ADV>", "<CARD>", "<NE>", "<PRO>", "<V>", "<ORD>", "<OTHER>"]).project("input"), "")
        )

        # Shape 2: "<VADJ>" is inserted, at least one "<kompos>" is deleted,
        # and the word ends in a deleted "<V>".
        vadj_word = (
            pynini.cross("", "<VADJ>")
            + pynini.union(alphabet, pynini.cross("<kompos>", "")).closure()
            + pynini.cross("<kompos>", "")
            + alphabet.closure()
            + pynini.cross("<V>", "")
        )

        # Shape 3: symbols with optional deleted "<kompos>", ending in a
        # deleted "<ADJ>" or "<NN>".
        adj_or_noun = (
            pynini.union(alphabet, pynini.cross("<kompos>", "")).closure()
            + pynini.cross(pynini.string_map(["<ADJ>", "<NN>"]).project("input"), "")
        )

        tail = (
            pynini.cross("<base>", "")
            + pynini.cross(self.__syms.origin_features, "")
            + self.__syms.inflection_classes
        )

        return (
            optional_prefix
            + pynini.union(plain_word, vadj_word, adj_or_noun)
            + tail
        ).optimize()
def __init__(self, ordinal: GraphFst, cardinal: GraphFst):
    """
    Build the "date" classifier.

    Two shapes are recognised:
      * day + month, optionally followed by a year, and
      * a year alone, optionally followed by "er" or "ern".
    " preserve_order: true" is appended to both.

    Args:
        ordinal: supplies `graph`, used for the day field.
        cardinal: stored on `self.cardinal` before the year graph is built.
    """
    super().__init__(name="date", kind="classify")

    # Stored first: self._get_year_graph() is called right after and may read it.
    self.cardinal = cardinal

    YEAR_WEIGHT = 0.001
    weighted_year = pynutil.add_weight(self._get_year_graph(), YEAR_WEIGHT)

    month_field = (
        pynutil.insert("month: \"") + _get_month_graph() + pynutil.insert("\"")
    )
    day_field = (
        pynutil.insert("day: \"")
        + pynutil.add_weight(ordinal.graph, -0.7)
        + pynutil.insert("\"")
    )

    # Year after day/month: the year weight added above is cancelled again.
    year_after_month = pynini.closure(
        delete_extra_space
        + pynutil.insert("year: \"")
        + pynutil.add_weight(weighted_year, -YEAR_WEIGHT)
        + pynutil.insert("\""),
        0,
        1,
    )
    day_month_year = day_field + delete_extra_space + month_field + year_after_month

    # Stand-alone year, with an optional "er" / "ern" ending kept inside the field.
    decade_ending = pynini.closure(
        pynini.accep('er') + pynini.closure(pynini.accep('n'), 0, 1), 0, 1)
    year_only = (
        pynutil.insert("year: \"") + weighted_year + decade_ending + pynutil.insert("\"")
    )

    final_graph = day_month_year | year_only
    final_graph += pynutil.insert(" preserve_order: true")
    self.fst = self.add_tokens(final_graph).optimize()
def __init__(self, cardinal, deterministic: bool = True):
    """
    Build the "fraction" classifier.

    Accepts an optional integer part followed by "numerator/denominator"
    (with or without spaces around the slash). The denominator may carry an
    ordinal ending ("rd", "th", "st", "nd" in lower or upper case), which is
    deleted. Inputs listed in data/number/fraction.tsv are first mapped through
    that file and then parsed the same way.

    Args:
        cardinal: supplies `graph`, used for every numeric field.
        deterministic: forwarded unchanged to the base-class constructor.
    """
    super().__init__(name="fraction", kind="classify", deterministic=deterministic)
    number = cardinal.graph

    integer_field = pynutil.insert("integer_part: \"") + number + pynutil.insert("\"")

    # The slash closes the numerator field and becomes the separating space.
    slash = pynini.cross("/", "\" ") | pynini.cross(" / ", "\" ")
    numerator_field = pynutil.insert("numerator: \"") + number + slash

    lower_endings = ["rd", "th", "st", "nd"]
    endings = lower_endings + [ending.upper() for ending in lower_endings]
    drop_ending = pynini.closure(pynini.cross(pynini.union(*endings), ""), 0, 1)
    denominator_field = (
        pynutil.insert("denominator: \"") + number + drop_ending + pynutil.insert("\"")
    )

    fraction = numerator_field + denominator_field

    graph = pynini.closure(integer_field + pynini.accep(" "), 0, 1) + fraction
    # Entries of the fraction file: the space after the integer part may be
    # present in the input or is inserted.
    graph |= pynini.closure(
        integer_field + (pynini.accep(" ") | pynutil.insert(" ")), 0, 1
    ) + pynini.compose(
        pynini.string_file(get_abs_path("data/number/fraction.tsv")), fraction
    )

    self.graph = graph
    self.fst = self.add_tokens(self.graph).optimize()
def get_year_graph(cardinal: GraphFst) -> 'pynini.FstLike':
    """
    Returns year verbalizations as fst

     < 2000 neunzehn (hundert) (vier und zwanzig), >= 2000 regular cardinal
    **00 ** hundert

    Relies on `delete_leading_zero` and `insert_space` from the enclosing
    scope; they are not parameters of this function.

    Args:
        cardinal: cardinal GraphFst; its `graph` and `two_digit_non_zero`
            graphs are used.
    """
    two_digit = delete_leading_zero @ cardinal.two_digit_non_zero
    say_hundert = pynutil.insert("hundert")
    # "1x" read as a two-digit number (first half of 1xyy / 1x00)
    teens_century = (pynini.accep("1") + NEMO_DIGIT) @ two_digit

    # 1xyy -> "<1x> [hundert] <yy>"
    pairwise = (
        teens_century
        + insert_space
        + pynini.closure(say_hundert + insert_space, 0, 1)
        + two_digit
    )
    # for 20**
    pairwise |= pynini.accep("20") @ two_digit + insert_space + two_digit

    # 1x00 -> "<1x> hundert"
    round_century = teens_century + insert_space + pynutil.delete("00") + say_hundert

    # 20yy / 21yy read as a regular cardinal
    as_cardinal = (pynini.union("21", "20") + NEMO_DIGIT**2) @ cardinal.graph

    return pairwise | round_century | as_cardinal
def __init__(self, cardinal: GraphFst, deterministic: bool = True):
    """
    Build the "ordinal" classifier for digit strings with an ordinal ending.

    The ending ("st", "nd", "rd", "th", lower or upper case) is deleted and
    the remaining digits (commas allowed) are composed with the cardinal
    graph. "st"/"nd"/"rd" are only accepted after a final 1/2/3 that is not
    preceded by a "1"; everything else takes "th".

    Args:
        cardinal: supplies `graph`, used to read the digits.
        deterministic: forwarded unchanged to the base-class constructor.
    """
    super().__init__(name="ordinal", kind="classify", deterministic=deterministic)

    digits_and_commas = pynini.closure(NEMO_DIGIT | pynini.accep(","))
    not_one = NEMO_DIGIT - "1"
    not_one_two_three = NEMO_DIGIT - "1" - "2" - "3"

    def units_format(last_digit, lower_ending, upper_ending):
        # Optional prefix whose last digit is not "1", then the given final
        # digit, then the ending (deleted).
        return (
            pynini.closure(digits_and_commas + not_one, 0, 1)
            + pynini.accep(last_digit)
            + pynutil.delete(pynini.union(lower_ending, upper_ending))
        )

    st_format = units_format("1", "st", "ST")
    nd_format = units_format("2", "nd", "ND")
    rd_format = units_format("3", "rd", "RD")

    th_format = pynini.closure(
        not_one_two_three
        | (digits_and_commas + "1" + NEMO_DIGIT)
        | (digits_and_commas + not_one + not_one_two_three),
        1,
    ) + pynutil.delete(pynini.union("th", "TH"))

    self.graph = (st_format | nd_format | rd_format | th_format) @ cardinal.graph

    tagged = pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")
    self.fst = self.add_tokens(tagged).optimize()
def __construct_insert_zu(self):
    '''
    Inserts "zu" into infinitives with separable prefixes.

    The result accepts either an unchanged sequence of alphabet symbols and
    stem types, or such a sequence followed by "<Pref_Stems>" ...
    "<Base_Stems>" ... "<^zz>" ..., where "z u" is inserted directly after
    "<Base_Stems>" and the "<^zz>" marker is deleted.
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(["<n>", "<~n>", "<e>", "<d>", "<NoHy>", "<NoDef>", "<VADJ>", "<CB>", "<FB>", "<UL>", "<SS>", "<DEL-S>", "<Low#>", "<Up#>", "<Fix#>", "<^imp>", "<^UC>", "<^Ax>", "<^pl>", "<^Gen>", "<^Del>"]).project("input")
        ).optimize()

        # Any sequence of alphabet symbols and stem types.
        anything = pynini.union(
            alphabet,
            self.__syms.stem_types
        ).closure().optimize()

        # From deko.fst:
        # insert "zu" after verbal prefixes if followed by infinitive marker
        #
        # An alternative starting directly at "<Base_Stems>" is disabled:
        #pynini.concat(
        #  pynini.accep("<Base_Stems>"),
        #  alphabet.closure(),
        #  pynini.cross("<^zz>", ""),
        #  alphabet.closure()
        #  ),
        with_zu = (
            anything
            + pynini.accep("<Pref_Stems>")
            + alphabet.closure()
            + pynini.accep("<Base_Stems>")
            + pynini.cross("", "z u")
            + alphabet.closure()
            + pynini.cross("<^zz>", "")
            + alphabet.closure()
        )

        return pynini.union(anything, with_zu).optimize()
def __init__(self, deterministic: bool = True):
    """
    Build the "electronic" classifier for "username@domain" strings.

    The username starts with a letter and continues with letters, digits or
    any symbol listed in the first column of data/electronic/symbols.tsv.
    The "@" is replaced by a space. The domain starts with a letter, ends
    with a letter or digit, and may contain letters, digits, "-" and "." in
    between.

    Args:
        deterministic: forwarded unchanged to the base-class constructor.
    """
    super().__init__(name="electronic", kind="classify", deterministic=deterministic)

    accepted_symbols = []
    # Read explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(get_abs_path("data/electronic/symbols.tsv"), 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines instead of failing on the unpack below.
                continue
            symbol, _ = line.split('\t')
            accepted_symbols.append(pynini.accep(symbol))

    username = (
        pynutil.insert("username: \"")
        + NEMO_ALPHA
        + pynini.closure(NEMO_ALPHA | NEMO_DIGIT | pynini.union(*accepted_symbols))
        + pynutil.insert("\"")
        + pynini.cross('@', ' ')
    )

    domain_graph = (
        NEMO_ALPHA
        + (pynini.closure(NEMO_ALPHA | NEMO_DIGIT | pynini.accep('-') | pynini.accep('.')))
        + (NEMO_ALPHA | NEMO_DIGIT)
    )
    domain_graph = pynutil.insert("domain: \"") + domain_graph + pynutil.insert("\"")

    graph = username + domain_graph
    final_graph = self.add_tokens(graph)
    self.fst = final_graph.optimize()
def __init__(self, deterministic: bool = True):
    """
    Build the "punctuation" classifier.

    `self.punct_marks` is the list of every code point whose Unicode category
    starts with "P" (except "[" and "]"), followed by the characters of a
    fixed ASCII string, minus anything listed in the first column of
    data/whitelist/symbol.tsv. The graph accepts one or more of these marks;
    angle-bracket tags such as <tag>, <tag/> or </tag> take priority over it.

    Args:
        deterministic: forwarded unchanged to the base-class constructor.
    """
    super().__init__(name="punctuation", kind="classify", deterministic=deterministic)

    s = "!#%&\'()*+,-./:;<=>?@^_`{|}~\""
    punct_symbols_to_exclude = ["[", "]"]

    unicode_marks = []
    for code_point in range(sys.maxunicode):
        char = chr(code_point)
        if char in punct_symbols_to_exclude:
            continue
        if category(char).startswith("P"):
            unicode_marks.append(char)

    whitelisted = [
        row[0] for row in load_labels(get_abs_path("data/whitelist/symbol.tsv"))
    ]

    self.punct_marks = []
    for mark in unicode_marks + list(s):
        if mark not in whitelisted:
            self.punct_marks.append(mark)

    one_or_more_marks = pynini.closure(pynini.union(*self.punct_marks), 1)

    # "<...>" tags: "<name>", "<name/>" or "</name>", where name contains no
    # space and no angle bracket.
    tag_name = pynini.closure(NEMO_NOT_SPACE - pynini.union("<", ">"), 1)
    opening_or_self_closing = tag_name + pynini.closure(pynini.accep("/"), 0, 1)
    closing = pynini.accep("/") + pynini.closure(NEMO_NOT_SPACE - pynini.union("<", ">"), 1)
    emphasis = (
        pynini.accep("<")
        + (opening_or_self_closing | closing)
        + pynini.accep(">")
    )

    self.graph = plurals._priority_union(emphasis, one_or_more_marks, NEMO_SIGMA)
    self.fst = (
        pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")
    ).optimize()
def __init__(self, tn_time: GraphFst, deterministic: bool = True):
    """
    Build the "time" classifier by inverting text-normalization time graphs.

    Three alternatives are combined:
      * the TN tagger composed with the TN verbalizer, inverted, and emitted
        inside an `hours: "..."` field;
      * "minutes + next hour (ordinal)" phrases;
      * "без + minutes + hour" phrases.

    Args:
        tn_time: TN time object; `graph_preserve_order`, `minutes`,
            `increment_hour_ordinal`, `mins_to_h` and
            `increment_hour_cardinal` are read from it.
        deterministic: forwarded unchanged to the base-class constructor.
    """
    super().__init__(name="time", kind="classify", deterministic=deterministic)

    # Full TN pipeline (tagger then verbalizer), inverted.
    tn_pipeline = pynini.compose(
        tn_time.graph_preserve_order, TNTimeVerbalizer().graph
    ).optimize()
    preserved = pynini.invert(tn_pipeline).optimize()
    preserved = pynutil.insert("hours: \"") + preserved + pynutil.insert("\"")

    # "пятнадцать минут шестого" -> 17:15
    # Requires permutations for the correct verbalization
    minutes_then_next_hour = (
        pynutil.insert("minutes: \"")
        + pynini.invert(tn_time.minutes).optimize()
        + pynutil.insert("\"")
        + pynini.accep(NEMO_SPACE)
        + pynutil.insert("hours: \"")
        + pynini.invert(tn_time.increment_hour_ordinal).optimize()
        + pynutil.insert("\"")
    ).optimize()

    # "без пятнадцати минут шесть" -> 17:45
    # Requires permutation for the correct verbalization
    minutes_to_hour = (
        pynini.cross("без ", "minutes: \"")
        + pynini.invert(tn_time.mins_to_h)
        + pynutil.insert("\"")
        + pynini.accep(NEMO_SPACE)
        + pynutil.insert("hours: \"")
        + pynini.invert(tn_time.increment_hour_cardinal).optimize()
        + pynutil.insert("\"")
    )

    reversed_order = minutes_then_next_hour | minutes_to_hour
    combined = preserved | reversed_order
    self.fst = self.add_tokens(combined).optimize()
def __construct_suff_phon(self):
    '''
    Delete an "i" whose left context is "i" or consonant + "y" followed by
    "<Suff_Stems>"; the rewrite is then concatenated with self.__tail.
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        sigma_star = pynini.union(
            self.__syms.characters,
            pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>", "<ge>", "<Ge>", "<no-ge>", "<Initial>", "<NoHy>", "<NoPref>", "<NoDef>", "<NN>", "<ADJ>"]).project("input"),
            self.__syms.stem_types,
        ).closure()

        # The rewrite itself: "i" -> nothing.
        drop_i = pynini.cross("i", "")

        # Left context: ("i" | consonant "y") "<Suff_Stems>"
        stem_ending = pynini.union(
            pynini.accep("i"),
            self.__syms.consonants.project("input") + pynini.accep("y")
        )
        left_context = stem_ending + pynini.accep("<Suff_Stems>")

        rule = pynini.cdrewrite(
            drop_i,
            left_context,
            "",
            sigma_star.project("input")
        )
        return (rule + self.__tail).optimize()
def __construct_compound_stems_nn(self, tmp):
    '''
    Default noun compounding stems.

    Builds, for every character string, the analyses
    "<+NN>" gender "<Nom> <Sg>" and "<+NN>" gender "<Nom> <Pl>", composes
    them with `tmp`, and wraps the result as
    "<Kompos_Stems>" ... "<NN>" "<kompos> <nativ>" (the first and last part
    are inserted).

    Args:
        tmp: FST the generated analyses are composed with.
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        # One or more characters. The function form pynini.closure() returns
        # a new FST. The method form self.__syms.characters.closure(1) would
        # convert the character set object itself into its closure.
        # NOTE(review): assumes self.__syms.characters hands back a shared
        # FST object - confirm against the symbols class.
        stem_characters = pynini.closure(self.__syms.characters, 1)

        kompos_stems = pynini.compose(
            pynini.concat(
                stem_characters,
                pynini.union(
                    pynini.cross(
                        "",
                        pynini.concat(
                            pynini.accep("<+NN>"),
                            pynini.concat(self.__syms.gender, pynini.accep("<Nom> <Sg>")))),
                    pynini.cross(
                        "",
                        pynini.concat(
                            pynini.accep("<+NN>"),
                            pynini.concat(self.__syms.gender, pynini.accep("<Nom> <Pl>")))))),
            tmp)

        return (pynini.cross("", "<Kompos_Stems>")
                + kompos_stems
                + pynini.accep("<NN>")
                + pynini.cross("", "<kompos> <nativ>")).optimize()
def __init__(self, cardinal: GraphFst, deterministic: bool):
    """
    Build the "decimal" classifier.

    Emits an optional `negative: "true"`, an optional `integer_part: "..."`,
    and a `fractional_part: "..."` read digit by digit. A quantity variant
    produced by get_quantity() is also accepted. In non-deterministic mode
    the fractional part may additionally be read as a cardinal, and an
    integer part "zero" may be replaced by "oh"; outputs containing both
    "oh" and "zero" are filtered out.

    Args:
        cardinal: supplies `graph`, `single_digits_graph` and
            `graph_hundred_component_at_least_one_none_zero_digit`.
        deterministic: selects deterministic or non-deterministic mode; also
            forwarded to the base-class constructor.
    """
    super().__init__(name="decimal", kind="classify", deterministic=deterministic)

    whole_number = cardinal.graph
    hundred_component = cardinal.graph_hundred_component_at_least_one_none_zero_digit

    self.graph = cardinal.single_digits_graph.optimize()
    if not deterministic:
        self.graph = self.graph | whole_number

    decimal_point = pynutil.delete(".")
    sign = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)

    self.graph_fractional = (
        pynutil.insert("fractional_part: \"") + self.graph + pynutil.insert("\"")
    )
    self.graph_integer = (
        pynutil.insert("integer_part: \"") + whole_number + pynutil.insert("\"")
    )

    unsigned = (
        pynini.closure(self.graph_integer + pynutil.insert(" "), 0, 1)
        + decimal_point
        + pynutil.insert(" ")
        + self.graph_fractional
    )
    self.final_graph_wo_negative = unsigned | get_quantity(unsigned, hundred_component)

    # reduce options for non_deterministic and allow either "oh" or "zero", but not combination
    if not deterministic:
        oh_before_zero = NEMO_SIGMA + "oh" + NEMO_SIGMA + "zero" + NEMO_SIGMA
        zero_before_oh = NEMO_SIGMA + "zero" + NEMO_SIGMA + "oh" + NEMO_SIGMA
        no_oh_zero = pynini.difference(
            NEMO_SIGMA, oh_before_zero | zero_before_oh
        ).optimize()
        no_zero_oh = pynini.difference(
            NEMO_SIGMA,
            NEMO_SIGMA + pynini.accep("zero") + NEMO_SIGMA + pynini.accep("oh") + NEMO_SIGMA,
        ).optimize()

        zero_to_oh = pynini.cdrewrite(
            pynini.cross("integer_part: \"zero\"", "integer_part: \"oh\""),
            NEMO_SIGMA,
            NEMO_SIGMA,
            NEMO_SIGMA,
        )
        self.final_graph_wo_negative |= pynini.compose(
            self.final_graph_wo_negative, zero_to_oh)

        for output_filter in (no_oh_zero, no_zero_oh):
            self.final_graph_wo_negative = pynini.compose(
                self.final_graph_wo_negative, output_filter).optimize()

    tagged = self.add_tokens(sign + self.final_graph_wo_negative)
    self.fst = tagged.optimize()
def get_serial_graph(self):
    """
    Finite state transducer for classifying serial numbers.

    A serial is a combination of digits and letters, optionally separated by
    "-" or "/", e.g.:
        c325b -> tokens { cardinal { integer: "c three two five b" } }

    Without delimiters, a space is inserted at every letter/digit boundary
    and at least one letter and one digit must be present. With delimiters,
    letter and number groups alternate around the delimiter; all-number
    serials need at least two delimiters. The result carries weight 2.

    Returns:
        The weighted serial graph.
    """
    numbers = self.single_digits_graph
    if not self.deterministic:
        numbers |= self.graph

    # add space between letter and digit
    letter_to_digit = pynini.cdrewrite(pynutil.insert(" "), NEMO_ALPHA, NEMO_DIGIT, NEMO_SIGMA)
    digit_to_letter = pynini.cdrewrite(pynutil.insert(" "), NEMO_DIGIT, NEMO_ALPHA, NEMO_SIGMA)
    spaced = pynini.compose(letter_to_digit, digit_to_letter)

    # make sure at least one digit and letter is present
    non_spaces = pynini.closure(NEMO_NOT_SPACE)
    has_letter_and_digit = (
        (non_spaces + NEMO_ALPHA + non_spaces + NEMO_DIGIT + non_spaces)
        | (non_spaces + NEMO_DIGIT + non_spaces + NEMO_ALPHA + non_spaces)
    )
    spaced = pynini.compose(has_letter_and_digit, spaced)

    space = pynini.accep(" ")

    # Starts with letters.
    letters_first = (
        pynini.closure(pynini.closure(NEMO_ALPHA, 1) + space, 1)
        + numbers
        + pynini.closure(
            space
            + pynini.closure(NEMO_ALPHA)
            + pynini.closure(space + numbers, 0, 1))
    )
    serial_graph = pynini.compose(spaced, letters_first)

    # Starts with a number.
    number_first = (
        numbers
        + space
        + pynini.closure(NEMO_ALPHA, 1)
        + pynini.closure(
            space
            + numbers
            + pynini.closure(space + pynini.closure(NEMO_ALPHA), 0, 1))
    )
    serial_graph |= pynini.compose(spaced, number_first)

    # serial graph with delimiter
    delimiter = pynini.accep("-") | pynini.accep("/")
    letters = pynini.closure(NEMO_ALPHA, 1)
    letters_then_number = letters + delimiter + numbers
    numbers_then_letters = pynini.closure(numbers + delimiter, 1) + letters
    continuation = pynini.closure(delimiter + (letters | numbers))
    continuation |= pynini.closure(delimiter + numbers + pynutil.insert(" ") + letters)

    serial_graph |= letters_then_number + continuation
    serial_graph |= numbers_then_letters + continuation

    # numbers only with 2+ delimiters
    serial_graph |= (
        numbers + delimiter + numbers + delimiter + numbers
        + pynini.closure(delimiter + numbers)
    )

    return pynutil.add_weight(serial_graph, 2)
def __init__(self, number_names: dict, alternative_formats: dict, deterministic: bool = False):
    """
    Build the "cardinal" classifier.

    The main graph accepts an optional "-" sign, a number with optional
    leading zeros, and an optional quantity from data/numbers/quantity.tsv.
    With a much higher weight, digit-by-digit readings and serial numbers
    are accepted as well. Finally, digits followed by "-х" are mapped to
    those readings whose output contains х".

    Args:
        number_names: passed to self.get_cardinal_numbers().
        alternative_formats: passed to self.get_cardinal_numbers().
        deterministic: forwarded unchanged to the base-class constructor.
    """
    super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

    self.cardinal_numbers_default = self.get_cardinal_numbers(
        number_names, alternative_formats, mode="all")
    self.cardinal_numbers_nominative = self.get_cardinal_numbers(
        number_names, alternative_formats, mode="nominative")

    self.optional_graph_negative = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space,
        0,
        1,
    )

    self.cardinal_numbers_with_optional_negative = (
        self.optional_graph_negative
        + pynutil.insert("integer: \"")
        + self.cardinal_numbers_default
        + pynutil.insert("\"")
    )

    # "03" -> remove leading zeros and verbalize
    strip_zeros = pynini.closure(pynini.cross("0", ""))
    self.cardinal_numbers_with_leading_zeros = (
        strip_zeros + self.cardinal_numbers_default
    ).optimize()

    # "123" -> "один два три"
    one_digit = pynini.compose(NEMO_DIGIT, self.cardinal_numbers_nominative)
    self.single_digits_graph = one_digit + pynini.closure(insert_space + one_digit)

    quantity = pynini.string_file(get_abs_path("data/numbers/quantity.tsv")).optimize()
    quantity = pynutil.insert("quantity: \"") + quantity + pynutil.insert("\"")
    # An existing space before the quantity is slightly preferred over an
    # inserted one.
    space_before_quantity = (
        pynutil.add_weight(pynini.accep(NEMO_SPACE), -0.1) | insert_space
    )
    optional_quantity = pynini.closure(space_before_quantity + quantity, 0, 1)

    # Must run after self.single_digits_graph has been assigned.
    serial_graph = self.get_serial_graph()

    final_graph = (
        self.optional_graph_negative
        + pynutil.insert("integer: \"")
        + self.cardinal_numbers_with_leading_zeros
        + pynutil.insert("\"")
        + optional_quantity
    ).optimize()
    final_graph = pynutil.add_weight(final_graph, -0.1)

    final_graph |= (
        pynutil.insert("integer: \"")
        + pynutil.add_weight(self.single_digits_graph | serial_graph, 10)
        + pynutil.insert("\"")
    )
    self.final_graph = final_graph

    # to cover cases "2-х" -> "двух" (this is not covered by ordinal endings)
    digits_with_ending = NEMO_DIGIT ** (1, ...) + pynini.cross('-х', '')
    final_graph |= pynini.compose(
        pynini.compose(digits_with_ending, final_graph),
        NEMO_SIGMA + pynini.accep("х\"") + NEMO_SIGMA,
    )

    self.fst = self.add_tokens(final_graph).optimize()
def testFilledExporter(self):
    """Export two FSTs and check that both can be read back."""
    contents = {'FST1': '1234', 'FST2': '4321'}

    writer = export.Exporter(self._filename)
    for fst_name, text in contents.items():
        writer[fst_name] = pynini.accep(text)
    writer.close()

    loaded = _read_fst_map(self._filename)
    self.assertLen(loaded, 2)
    for fst_name in contents:
        self.assertTrue(loaded[fst_name])
def __construct_verbal_pref_stems(self):
    '''
    Verbal prefix stems.

    Restricts self.__pref_stems to entries of the shape
    initial-features* "<Pref_Stems>" ... "<V>" ...
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        # The function form pynini.closure() returns a new FST. The method
        # form self.__syms.initial_features.closure() would convert the
        # feature set object itself into its closure.
        # NOTE(review): assumes self.__syms.initial_features hands back a
        # shared FST object - confirm against the symbols class.
        any_initial_features = pynini.closure(self.__syms.initial_features)

        return pynini.compose(
            self.__pref_stems,
            any_initial_features
            + pynini.accep("<Pref_Stems>")
            + self.__sigma_star
            + pynini.accep("<V>", token_type=self.__syms.alphabet)
            + self.__sigma_star
        ).optimize()
def __construct_umlautung(self):
    '''
    Map "a", "o" and "u" onto "ä", "ö" and "ü", corresp., if the umlaut marker "<UL>" is present.

    The optional rule matches: one consonant, the vowel to be changed
    ("a", "o", "u", or "aa" -> "ä", "au" -> "äu"), any number of consonants,
    an optional "el"/"er", and "<Suff_Stems>" followed by "<UL>" (deleted).
    It is preceded by any alphabet symbols and followed by self.__tail.
    '''
    with pynini.default_token_type(self.__syms.alphabet):
        alphabet = pynini.union(
            self.__syms.characters,
            pynini.string_map(["<n>", "<e>", "<d>", "<~n>", "<Ge-Nom>", "<SS>", "<FB>", "<ge>", "<Ge>", "<no-ge>", "<Initial>", "<NoHy>", "<NoPref>", "<NoDef>"]).project("input"),
            self.__syms.stem_types,
            self.__syms.categories,
        ).closure()

        umlaut_vowel = pynini.union(
            pynini.union(
                pynini.cross("a", "ä"),
                pynini.cross("o", "ö"),
                pynini.cross("u", "ü")
            ),
            pynini.concat(
                pynini.cross("a", "ä"),
                pynini.union(
                    pynini.cross("a", ""),
                    pynini.accep("u")
                )
            )
        )

        # The function form pynini.closure() returns a new FST. The method
        # form self.__syms.consonants.closure() would convert the consonant
        # set itself into its closure; since the same object is also the
        # first element of the rule below, the required single consonant
        # would then silently become "any number of consonants".
        # NOTE(review): assumes self.__syms.consonants hands back a shared
        # FST object - confirm against the symbols class.
        trailing_consonants = pynini.closure(self.__syms.consonants)

        optional_el_er = pynini.concat(
            pynini.accep("e"),
            pynini.string_map(["l", "r"]).project("input")
        ).closure(0, 1)

        suffix_with_marker = pynini.concat(
            pynini.accep("<Suff_Stems>"),
            pynini.cross("<UL>", "")
        )

        optional_rule = pynini.concat(
            self.__syms.consonants,
            pynini.concat(
                umlaut_vowel,
                pynini.concat(
                    trailing_consonants,
                    pynini.concat(optional_el_er, suffix_with_marker)
                )
            )
        ).closure(0, 1)

        return pynini.concat(
            pynini.concat(alphabet, optional_rule),
            self.__tail
        ).optimize()
def __init__(self, deterministic: bool = True):
    """
    Build the "word" classifier.

    A word is one or more non-space characters that are neither digits nor
    one of "$", "€", "₩", "£", "¥", "#", "%". Bracketed phoneme sequences take
    priority over plain words and are passed through convert_space().

    Args:
        deterministic: when False, a single optional space is also allowed
            directly inside each bracket of a phoneme sequence; the value is
            also forwarded to the base-class constructor.
    """
    super().__init__(name="word", kind="classify", deterministic=deterministic)

    excluded = (
        pynini.union("$", "€", "₩", "£", "¥", "#", "%") | NEMO_DIGIT
    ).optimize()
    plain_word = pynini.closure(pynini.difference(NEMO_NOT_SPACE, excluded), 1)

    # leave phones of format [HH AH0 L OW1] untouched
    phoneme_unit = pynini.closure(NEMO_ALPHA, 1) + pynini.closure(NEMO_DIGIT)
    units = pynini.closure(phoneme_unit + pynini.accep(" ")) + phoneme_unit
    open_bracket = pynini.accep(pynini.escape("["))
    close_bracket = pynini.accep(pynini.escape("]"))

    if deterministic:
        phoneme = open_bracket + units + close_bracket
    else:
        phoneme = (
            open_bracket
            + pynini.closure(pynini.accep(" "), 0, 1)
            + units
            + pynini.closure(pynini.accep(" "), 0, 1)
            + close_bracket
        )

    self.graph = plurals._priority_union(convert_space(phoneme), plain_word, NEMO_SIGMA)
    self.fst = (
        pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")
    ).optimize()
def __init__(
    self,
    itn_cardinal_tagger: GraphFst,
    tn_date_tagger: GraphFst,
    tn_date_verbalizer: GraphFst,
    deterministic: bool = True,
):
    """
    Build the "date" classifier from text-normalization date graphs.

    The TN verbalizer graph is inverted to obtain tagged fields from spoken
    text. Those fields are then rewritten into written form
    ("day. month [year]", "dd.mm.[year]" or a year alone), and the result is
    emitted inside a `name: "..."` field.

    Args:
        itn_cardinal_tagger: supplies `graph_no_exception`, used for day and
            numeric month.
        tn_date_tagger: supplies `month_abbr` and `year`.
        tn_date_verbalizer: supplies `graph`.
        deterministic: forwarded unchanged to the base-class constructor.
    """
    super().__init__(name="date", kind="classify", deterministic=deterministic)

    two_digits = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT)
    drop_spaces = pynini.closure(NEMO_SIGMA | pynutil.delete(" ", weight=0.0001))

    # NOTE(review): the method form .invert() presumably works in place on
    # the object it is called on - confirm against the pynini documentation.
    spoken_to_fields = tn_date_verbalizer.graph.invert().optimize()

    day_value = (
        pynutil.delete("day: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
    ) @ itn_cardinal_tagger.graph_no_exception

    numeric_month = (
        pynutil.delete("month: \"")
        + itn_cardinal_tagger.graph_no_exception
        + pynutil.delete("\"")
    )
    named_month = (
        pynutil.delete("month: \"")
        + tn_date_tagger.month_abbr.invert()
        + pynutil.delete("\"")
    )

    year_to_digits = (tn_date_tagger.year @ drop_spaces).invert().optimize()
    year_value = (
        pynutil.delete("year: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
    ) @ year_to_digits

    # day. month as string (year)
    verbalizer = (
        pynini.closure(day_value + pynutil.insert(".") + pynini.accep(" "), 0, 1)
        + named_month
        + pynini.closure(pynini.accep(" ") + year_value, 0, 1)
    )

    # day. month as number (year)
    verbalizer |= (
        (day_value @ two_digits)
        + pynutil.insert(".")
        + pynutil.delete(" ")
        + (numeric_month @ two_digits)
        + pynutil.insert(".")
        + pynini.closure(pynutil.delete(" ") + year_value, 0, 1)
    )

    # year
    verbalizer |= year_value

    spoken_to_written = spoken_to_fields @ verbalizer
    self.fst = (
        pynutil.insert("name: \"") + convert_space(spoken_to_written) + pynutil.insert("\"")
    ).optimize()
def __init__(self, deterministic: bool = True):
    """
    Build the "electronic" classifier.

    Three shapes are accepted:
      * username@domain, where the domain has the form name.name;
      * a name followed by one of the endings listed in
        data/electronic/domain.tsv;
      * a protocol prefix ("http://", "https://", "www." or a scheme followed
        by "www.") followed by a name.name domain. The prefix is spelled out
        symbol by symbol with a space after each one.

    Args:
        deterministic: forwarded unchanged to the base-class constructor.
    """
    super().__init__(name="electronic", kind="classify", deterministic=deterministic)

    def get_input_symbols(path):
        # One acceptor per line, built from the first of two tab-separated columns.
        acceptors = []
        with open(path, 'r', encoding='utf-8') as handle:
            for row in handle:
                symbol, _ = row.split('\t')
                acceptors.append(pynini.accep(symbol))
        return acceptors

    symbol_acceptors = get_input_symbols(get_abs_path("data/electronic/symbols.tsv"))
    common_domains = get_input_symbols(get_abs_path("data/electronic/domain.tsv"))

    # A name: a letter followed by letters, digits or accepted symbols.
    name = NEMO_ALPHA + pynini.closure(
        NEMO_ALPHA | NEMO_DIGIT | pynini.union(*symbol_acceptors))

    symbol_names = pynini.string_file(
        get_abs_path("data/electronic/symbols.tsv")).optimize()

    username = (
        pynutil.insert("username: \"")
        + name
        + pynutil.insert("\"")
        + pynini.cross('@', ' ')
    )

    domain = (
        pynutil.insert("domain: \"")
        + (name + pynini.accep('.') + name)
        + pynutil.insert("\"")
    )
    common_domain = (
        pynutil.insert("domain: \"")
        + name
        + pynini.union(*common_domains)
        + pynutil.insert("\"")
    )

    scheme = pynini.accep("https://") | pynini.accep("http://")
    www = pynini.accep("www.")
    spell_out = pynini.closure(
        (
            NEMO_ALPHA
            | pynutil.add_weight(symbol_names | pynini.cross(":", "colon"), -0.1)
        )
        + pynutil.insert(" ")
    )
    protocol = pynini.compose(scheme | www | (scheme + www), spell_out)
    protocol = pynutil.insert("protocol: \"") + protocol + pynutil.insert("\"")

    graph = username + domain
    graph |= common_domain
    graph |= protocol + pynutil.insert(" ") + domain

    self.fst = self.add_tokens(graph).optimize()
def __init__(self, cardinal: GraphFst, deterministic=False):
    """
    Build the "ordinal" classifier.

    Accepts digits (dots allowed) followed either by one of the endings
    "ter", "tes", "tem", "te", "ten" or by a final ".". The ending is
    deleted and the rest is composed with the cardinal graph.

    Args:
        cardinal: supplies `graph`.
        deterministic: forwarded unchanged to the base-class constructor.
    """
    super().__init__(name="ordinal", kind="classify", deterministic=deterministic)

    endings = ["ter", "tes", "tem", "te", "ten"]
    # Written endings cost slightly more than a plain trailing dot.
    ending = pynutil.add_weight(pynini.union(*endings), weight=0.0001) | pynini.accep(".")
    digits_and_dots = pynini.closure(NEMO_DIGIT | pynini.accep("."))

    self.graph = (
        (digits_and_dots + pynutil.delete(ending)) @ cardinal.graph
    ).optimize()

    tagged = pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")
    self.fst = self.add_tokens(tagged).optimize()
def __init__(self, category: Category, *features_and_values: str) -> None:
    """Sets up an acceptor for the defined category.

    For every feature of the category, in order, the acceptor takes either
    the single value given in features_and_values or, if none was given, any
    value the feature allows.

    Args:
      category: a Category.
      *features_and_values: list of strings, consisting of specific
        feature-value settings such as "num=sg", "gen=mas", etc.

    Raises:
      Error: No features_and_values provided.
      Error: Invalid name.
    """
    if not features_and_values:
        raise Error("No features_and_values provided")
    self._category = category
    settings = {}
    self._feature_settings = settings
    known_names = frozenset(feat.name for feat in category.features)

    # Parse "name=value" strings.
    for setting in features_and_values:
        (name, value) = setting.split("=")
        if name not in known_names:
            raise Error(f"Invalid name: {name}")
        settings[name] = value

    # One acceptor per feature, in the category's feature order.
    parts = []
    for feature in category.features:
        chosen = settings.get(feature.name)
        if chosen is None:
            # If not specified, allows all values.
            parts.append(feature.acceptor)
            continue
        if chosen not in feature.values:
            raise Error(f"Invalid name: {feature.name}")
        parts.append(pynini.accep(f"[{feature.name}={chosen}]"))
    self._acceptor = _concatstar(parts)
def __init__(self,
             name: str,
             *values: str,
             default: Optional[str] = None) -> None:
    """Sets up an acceptor for the defined features.

    The acceptor accepts anything in [name=v] for v in values.

    Args:
      name: a string, the name for this feature (e.g. "gender")
      *values: one or more values (e.g. "masc", "fem", "neu")
      default: if set, is the default value for this feature, which is added
        to values if not already there.

    Raises:
      Error: No values provided.
    """
    if not values:
        # The exception used to be constructed but never raised, so the
        # check had no effect.
        raise Error("No values provided to Feature object")
    self._name = name
    self._values = list(values)
    self._default = default
    self._default_acceptor = None
    if self._default:
        if self._default not in self._values:
            self._values.append(self._default)
        self._default_acceptor = pynini.accep(f"[{name}={self._default}]")
        self._default_acceptor.optimize()
    self._acceptor = pynini.union(*(f"[{self._name}={v}]"
                                    for v in self._values))
    self._acceptor.optimize()
def __init__(self, deterministic: bool = True):
    """
    Build the verbalizer graph for "fraction" tokens.

    Field markup is deleted. The denominator is rewritten by the ordinal
    suffix graph (or "four" -> "quarter"), with a trailing "s" unless the
    numerator is "one". One half is rendered as "a half" or "one half";
    denominators "one" and "two" may also be rendered as "over one" and
    "halves".

    Args:
        deterministic: when False the inserted "and " after the integer part
            becomes optional; the value is also forwarded to the base-class
            constructor.
    """
    super().__init__(name="fraction", kind="verbalize", deterministic=deterministic)
    ordinal_suffix = OrdinalFst().suffix

    quoted_text = pynini.closure(NEMO_NOT_QUOTE)
    integer_field = pynutil.delete("integer: \"") + quoted_text + pynutil.delete("\" ")
    numerator = (
        pynutil.delete("numerator: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\" ")
    )
    numerator_one = (
        pynutil.delete("numerator: \"") + pynini.accep("one") + pynutil.delete("\" ")
    )
    denominator = pynutil.delete("denominator: \"") + (
        (pynini.closure(NEMO_NOT_QUOTE) @ ordinal_suffix) | pynini.cross('four', 'quarter')
    )

    conjunction = pynutil.insert("and ")
    if not deterministic:
        conjunction = pynini.closure(conjunction, 0, 1)

    optional_integer = pynini.closure(integer_field + insert_space + conjunction, 0, 1)

    a_half = pynini.cross("numerator: \"one\" denominator: \"two\"", "a half")
    over_one_or_halves = pynini.cross("denominator: \"one\"", "over one") | pynini.cross(
        "denominator: \"two\"", "halves"
    )

    plural_fraction = pynutil.add_weight(
        numerator + insert_space + denominator + pynutil.insert("s") + pynutil.delete("\""),
        0.001,
    )
    singular_fraction = pynutil.add_weight(
        numerator_one + insert_space + denominator + pynutil.delete("\""),
        0.0001,
    )

    # NOTE(review): "+" binds tighter than "|", so the optional integer part
    # is attached to the "a half" alternative only. The parentheses spell
    # out that existing grouping; confirm it is the intended one.
    graph = (optional_integer + a_half) | (singular_fraction | plural_fraction)
    graph |= pynini.cross("numerator: \"one\" denominator: \"two\"", "one half")
    graph |= (numerator | numerator_one) + insert_space + over_one_or_halves

    self.graph = graph
    self.fst = self.delete_tokens(self.graph).optimize()
def __init__(self, tn_cardinal_tagger: GraphFst, deterministic: bool = True):
    """
    Build the "telephone" classifier.

    A written-to-spoken graph is built for numbers of the form
    "(0ddd) dddd-dddd" (d = 1..9, each digit read through
    `two_digit_non_zero`, "0" read as "null") and then inverted, so the
    final graph maps the spoken form back to the written one inside a
    `name: "..."` field.

    Args:
        tn_cardinal_tagger: supplies `two_digit_non_zero`.
        deterministic: forwarded unchanged to the base-class constructor.
    """
    super().__init__(name="telephone", kind="classify", deterministic=deterministic)

    separator = pynini.accep(" ")  # between components
    digit = (
        pynini.union(*[str(value) for value in range(1, 10)])
        @ tn_cardinal_tagger.two_digit_non_zero
    )
    zero = pynini.cross("0", "null")

    def spoken_digits(count):
        # `count` digits with a space inserted between neighbours.
        return pynini.closure(digit + insert_space, count - 1, count - 1) + digit

    area_code = (
        pynutil.delete("(") + zero + insert_space + spoken_digits(3) + pynutil.delete(")")
    )
    number_part = (
        area_code
        + separator
        + spoken_digits(4)
        + pynutil.delete("-")
        + insert_space
        + spoken_digits(4)
    )

    spoken_to_written = convert_space(pynini.invert(number_part))
    self.fst = (
        pynutil.insert("name: \"") + spoken_to_written + pynutil.insert("\"")
    ).optimize()
def __init__(self, whitelist: 'pynini.FstLike', deterministic: bool = True):
    """
    Build the "abbreviation" classifier.

    Accepts sequences of two or more upper-case letters, either each followed
    by a dot or without dots, and offers each of them both unchanged and with
    spaces inserted between the letters. Inputs that the whitelist graph
    also accepts are excluded.

    Args:
        whitelist: object whose `graph` defines the inputs to exclude.
        deterministic: forwarded unchanged to the base-class constructor.
    """
    super().__init__(name="abbreviation", kind="classify", deterministic=deterministic)

    dot = pynini.accep(".")
    dotted_letter = NEMO_UPPER + dot

    # A.B.C. -> A. B. C.
    variants = dotted_letter + pynini.closure(insert_space + NEMO_UPPER + dot, 1)
    # A.B.C. -> A.B.C.
    variants |= dotted_letter + pynini.closure(NEMO_UPPER + dot, 1)
    # ABC -> ABC
    variants |= NEMO_UPPER + pynini.closure(NEMO_UPPER, 1)
    # ABC -> A B C
    variants |= NEMO_UPPER + pynini.closure(insert_space + NEMO_UPPER, 1)

    # exclude words that are included in the whitelist
    allowed_inputs = pynini.difference(
        pynini.project(variants, "input"),
        pynini.project(whitelist.graph, "input"),
    )
    filtered = pynini.compose(allowed_inputs, variants)

    tagged = pynutil.insert("value: \"") + filtered.optimize() + pynutil.insert("\"")
    self.fst = self.add_tokens(tagged).optimize()
def __init__(self, number_names: dict, alternative_formats: dict, deterministic=False):
    """
    Build the "ordinal" classifier.

    Plain ordinals are read through the `separators` graph and the ordinal
    number names (extended with the alternative "one thousand" format).
    Numbers written with a dash ending (e.g. 2-ая) are read as well: the
    output must end in an entry of data/numbers/ordinal_endings.tsv, and the
    "-..." part is removed afterwards.

    Args:
        number_names: dict; the 'ordinal_number_names' entry is used.
        alternative_formats: dict; the 'one_thousand_alternative' and
            'separators' entries are used.
        deterministic: forwarded unchanged to the base-class constructor.
    """
    super().__init__(name="ordinal", kind="classify", deterministic=deterministic)

    separators = alternative_formats['separators']
    thousand_alternative = alternative_formats['one_thousand_alternative']

    ordinal = number_names['ordinal_number_names']
    ordinal |= ordinal @ thousand_alternative
    plain_ordinals = separators @ ordinal

    # to handle cases like 2-ая
    endings = pynini.string_file(get_abs_path("data/numbers/ordinal_endings.tsv"))
    not_dash = pynini.closure(pynini.difference(NEMO_SIGMA, "-"))
    # Removes "-" plus what follows it at the end of the string.
    del_ending = pynini.cdrewrite(pynini.cross("-" + not_dash, ""), "", "[EOS]", NEMO_SIGMA)

    number_with_written_ending = (
        (separators @ ordinal).optimize() + pynini.accep("-") + not_dash
    ).optimize()
    must_end_in_known_ending = (NEMO_SIGMA + endings).optimize()
    marked_ordinals = (
        number_with_written_ending @ must_end_in_known_ending @ del_ending
    ).optimize()

    self.ordinal_numbers = plain_ordinals

    # "03" -> remove leading zeros and verbalize
    strip_zeros = pynini.closure(pynini.cross("0", ""))
    self.ordinal_numbers_with_leading_zeros = (strip_zeros + plain_ordinals).optimize()

    combined = (plain_ordinals | marked_ordinals).optimize()
    tagged = pynutil.insert("integer: \"") + combined + pynutil.insert("\"")
    self.fst = self.add_tokens(tagged).optimize()