def get_alternative_formats():
    """Builds alternative written formats for numbers.

    Returns:
        dict with two optimized FSTs:
          'one_thousand_alternative': rewrite that replaces a number name at
              the beginning of the string with its alternative form from
              cardinals_alternatives.tsv.
          'separators': union of thousands-separator conventions (dot, space,
              or none), weighted so that "no delimiter" is preferred.
    """
    one_alternatives = load_labels(
        get_abs_path('data/numbers/cardinals_alternatives.tsv'))
    # Key each alternative by its second word; the first (default) column of
    # the TSV is not needed here, hence the `_` placeholder.
    one_thousand_map = pynini.string_map(
        [(alternative.split()[1], alternative)
         for _, alternative in one_alternatives])
    one_thousand_alternative = pynini.cdrewrite(one_thousand_map, "[BOS]", "",
                                                NEMO_SIGMA)
    # Adapted from
    # https://github.com/google/TextNormalizationCoveringGrammars/blob/master/src/universal/thousands_punct.grm
    # Specifies common ways of delimiting thousands in digit strings.
    t = pynini.Far(get_abs_path('data/utils/universal_thousands_punct.far'))
    separators = (pynutil.add_weight(t['dot_thousands'], 0.1)
                  | pynutil.add_weight(t['no_delimiter'], -0.1)
                  | pynutil.add_weight(t['space_thousands'], 0.1))
    return {
        'one_thousand_alternative': one_thousand_alternative.optimize(),
        'separators': separators.optimize(),
    }
def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
    """Builds (or restores from a FAR cache) the English ITN tokenize-and-classify FST.

    Args:
        cache_dir: directory for caching the compiled FAR; the literal string
            "None" (e.g. from a CLI flag) also disables caching.
        overwrite_cache: when True, recompile grammars even if a cached FAR exists.
    """
    super().__init__(name="tokenize_and_classify", kind="classify")
    far_file = None
    if cache_dir is not None and cache_dir != "None":
        os.makedirs(cache_dir, exist_ok=True)
        far_file = os.path.join(cache_dir, "_en_itn.far")
    if not overwrite_cache and far_file and os.path.exists(far_file):
        # Fast path: reuse the previously compiled grammar from the FAR cache.
        self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
        logging.info(f"ClassifyFst.fst was restored from {far_file}.")
    else:
        logging.info(f"Creating ClassifyFst grammars.")
        # Component taggers. cardinal/ordinal/decimal instances are shared so
        # dependent grammars (measure, date, money, telephone) reuse their sub-FSTs.
        cardinal = CardinalFst()
        cardinal_graph = cardinal.fst
        ordinal = OrdinalFst(cardinal)
        ordinal_graph = ordinal.fst
        decimal = DecimalFst(cardinal)
        decimal_graph = decimal.fst
        measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal).fst
        date_graph = DateFst(ordinal=ordinal).fst
        word_graph = WordFst().fst
        time_graph = TimeFst().fst
        money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst
        whitelist_graph = WhiteListFst().fst
        punct_graph = PunctuationFst().fst
        electronic_graph = ElectronicFst().fst
        telephone_graph = TelephoneFst(cardinal).fst
        # Union over all semiotic classes. Lower weight wins during shortest-path,
        # so whitelist (1.01) and date (1.09) are preferred over the 1.1 classes,
        # and plain words (100) are the fallback of last resort.
        classify = (pynutil.add_weight(whitelist_graph, 1.01)
                    | pynutil.add_weight(time_graph, 1.1)
                    | pynutil.add_weight(date_graph, 1.09)
                    | pynutil.add_weight(decimal_graph, 1.1)
                    | pynutil.add_weight(measure_graph, 1.1)
                    | pynutil.add_weight(cardinal_graph, 1.1)
                    | pynutil.add_weight(ordinal_graph, 1.1)
                    | pynutil.add_weight(money_graph, 1.1)
                    | pynutil.add_weight(telephone_graph, 1.1)
                    | pynutil.add_weight(electronic_graph, 1.1)
                    | pynutil.add_weight(word_graph, 100))
        # Wrap each classified span (and punctuation) in "tokens { ... }" markup.
        punct = pynutil.insert("tokens { ") + pynutil.add_weight(
            punct_graph, weight=1.1) + pynutil.insert(" }")
        token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
        # A token may carry any amount of punctuation on either side.
        token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) + token
                            + pynini.closure(pynutil.insert(" ") + punct))
        graph = token_plus_punct + pynini.closure(delete_extra_space +
                                                  token_plus_punct)
        # Strip leading/trailing whitespace from the whole utterance.
        graph = delete_space + graph + delete_space
        self.fst = graph.optimize()
        if far_file:
            generator_main(far_file, {"tokenize_and_classify": self.fst})
            logging.info(f"ClassifyFst grammars are saved to {far_file}.")
def __init__(self, deterministic: bool = True, cache_dir: str = None,
             overwrite_cache: bool = False):
    """Builds (or restores from a FAR cache) the final verbalizer FST.

    Args:
        deterministic: when True, only a single verbalization per token is produced.
        cache_dir: directory for caching the compiled FAR; the literal string
            "None" also disables caching.
        overwrite_cache: when True, recompile grammars even if a cached FAR exists.
    """
    super().__init__(name="verbalize_final",
                     kind="verbalize",
                     deterministic=deterministic)
    far_file = None
    if cache_dir is not None and cache_dir != "None":
        os.makedirs(cache_dir, exist_ok=True)
        far_file = os.path.join(
            cache_dir, f"de_tn_{deterministic}_deterministic_verbalizer.far")
    if not overwrite_cache and far_file and os.path.exists(far_file):
        self.fst = pynini.Far(far_file, mode="r")["verbalize"]
        logging.info(
            f'VerbalizeFinalFst graph was restored from {far_file}.')
    else:
        verbalize = VerbalizeFst(deterministic=deterministic).fst
        word = WordFst(deterministic=deterministic).fst
        # A token body is either a semiotic-class verbalization or a plain word.
        types = verbalize | word
        # Strip the "tokens { ... }" wrapper that the classifier emitted.
        graph = (pynutil.delete("tokens") + delete_space +
                 pynutil.delete("{") + delete_space + types + delete_space +
                 pynutil.delete("}"))
        # One or more tokens, single-space separated; trim outer whitespace.
        graph = delete_space + pynini.closure(
            graph + delete_extra_space) + graph + delete_space
        self.fst = graph.optimize()
        if far_file:
            generator_main(far_file, {"verbalize": self.fst})
            logging.info(
                f"VerbalizeFinalFst grammars are saved to {far_file}.")
def __init__(
    self,
    input_case: str,
    cache_dir: str = None,
    overwrite_cache: bool = False,
    deterministic: bool = True,
    whitelist: str = None,
):
    """Builds (or restores from a FAR cache) a words+whitelist tokenize-and-classify FST.

    Args:
        input_case: accepted input case, e.g. lower-cased or case-sensitive.
        cache_dir: directory for caching the compiled FAR; the literal string
            "None" also disables caching.
        overwrite_cache: when True, recompile grammars even if a cached FAR exists.
        deterministic: when True, only a single tagging per token is produced.
        whitelist: optional path to a custom whitelist TSV file; its basename is
            folded into the cache-file name so different whitelists don't collide.
    """
    super().__init__(name="tokenize_and_classify",
                     kind="classify",
                     deterministic=deterministic)
    far_file = None
    if cache_dir is not None and cache_dir != "None":
        os.makedirs(cache_dir, exist_ok=True)
        whitelist_file = os.path.basename(whitelist) if whitelist else ""
        far_file = os.path.join(
            cache_dir,
            f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
        )
    if not overwrite_cache and far_file and os.path.exists(far_file):
        self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
        logging.info(f'ClassifyFst.fst was restored from {far_file}.')
    else:
        logging.info(f"Creating ClassifyFst grammars.")
        word_graph = WordFst(deterministic=deterministic).fst
        whitelist_graph = WhiteListFst(input_case=input_case,
                                       deterministic=deterministic).fst
        punct_graph = PunctuationFst(deterministic=deterministic).fst
        # Whitelist entries (weight 1) are strongly preferred over plain words (100).
        classify = pynutil.add_weight(whitelist_graph, 1) | pynutil.add_weight(
            word_graph, 100)
        punct = pynutil.insert("tokens { ") + pynutil.add_weight(
            punct_graph, weight=2.1) + pynutil.insert(" }")
        # Between tokens: either collapse runs of whitespace, or attach one or
        # more punctuation tokens.
        punct = pynini.closure(
            pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1),
                           delete_extra_space)
            | (pynutil.insert(" ") + punct),
            1,
        )
        token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
        token_plus_punct = (
            pynini.closure(punct + pynutil.insert(" ")) + token +
            pynini.closure(pynutil.insert(" ") + punct)
        )
        graph = (
            token_plus_punct
            + pynini.closure(
                (
                    pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1),
                                   delete_extra_space)
                    | (pynutil.insert(" ") + punct + pynutil.insert(" "))
                )
                + token_plus_punct
            ).optimize()
        )
        graph = delete_space + graph + delete_space
        # Allow an input that is punctuation only.
        graph |= punct
        self.fst = graph.optimize()
        if far_file:
            generator_main(far_file, {"tokenize_and_classify": self.fst})
            logging.info(f"ClassifyFst grammars are saved to {far_file}.")
def setUpClass(cls):
    """Builds a two-rule FAR (DOWNCASE/UPCASE) in a temp file for the cascade tests."""
    super().setUpClass()
    fold = pynini.string_map((("A", "a"), ("B", "b"))).optimize()
    # tempfile.mkstemp returns (fd, path); indexing [1] leaked the open OS-level
    # file descriptor. NamedTemporaryFile(delete=False) gives the same on-disk
    # path and lets us close the handle immediately; pynini.Far reopens by name.
    tmp = tempfile.NamedTemporaryFile(suffix=".far", delete=False)
    tmp.close()
    cls.far_path = tmp.name
    with pynini.Far(cls.far_path, "w") as far:
        far["DOWNCASE"] = fold
        far["UPCASE"] = fold.invert()
    cls.cascade = rule_cascade.RuleCascade(cls.far_path)
def test_iso_roundtrip(self, tag: str):
    """Checks that the FROM_/TO_ ISO transducers compose to (near) identity."""
    tag = tag.upper()
    far_path = u.FAR_DIR / 'iso.far'
    with pynini.Far(file.AsResourcePath(far_path), 'r') as far:
        forward = far[f'FROM_{tag}']
        backward = far[f'TO_{tag}']
        self.assertFstProbablyIdentity([forward, backward],
                                       token_type='byte',
                                       samples=test_util.NUM_TEST_SAMPLES)
def test_romanization_roundtrip(self):
    """Checks that romanization followed by de-romanization is functional."""
    far_path = u.FAR_DIR / 'reversible_roman.far'
    with pynini.Far(uf.AsResourcePath(far_path), 'r') as far:
        to_latin = far['FROM_ARAB']
        from_latin = far['TO_ARAB']
        # Compose the two directions and test the round trip.
        round_trip = to_latin @ from_latin
        self.assertFstProbablyFunctional(round_trip,
                                         token_type='byte',
                                         samples=ut.NUM_TEST_SAMPLES)
def __init__(self, deterministic: bool = True):
    """Cardinal tagger: digit strings (optionally comma-grouped, optionally
    negative) -> 'integer: "..."' tokens, with serial/range/digit-by-digit
    alternatives in non-deterministic mode.

    Args:
        deterministic: when True, only a single transduction per input is produced.
    """
    super().__init__(name="cardinal",
                     kind="classify",
                     deterministic=deterministic)
    # Pre-compiled digit-string -> number-name transducer.
    graph = pynini.Far(
        get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()
    # 2-3 digit strings, or a single non-zero digit.
    self.graph_hundred_component_at_least_one_none_zero_digit = (
        pynini.closure(NEMO_DIGIT, 2, 3)
        | pynini.difference(NEMO_DIGIT, pynini.accep("0"))) @ graph
    # Up to 3 leading digits followed by optional-comma 3-digit groups.
    self.graph = (pynini.closure(NEMO_DIGIT, 1, 3) + pynini.closure(
        pynini.closure(pynutil.delete(","), 0, 1) + NEMO_DIGIT + NEMO_DIGIT +
        NEMO_DIGIT)) @ graph
    graph_digit = pynini.string_file(
        get_abs_path("data/numbers/digit.tsv"))
    graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    # Digit-by-digit reading; "0" -> "oh" (1.1) is preferred over "zero" (1.2).
    single_digits_graph = pynutil.add_weight(
        pynini.invert(graph_digit | graph_zero), 1.2) | pynutil.add_weight(
            pynini.cross("0", "oh"), 1.1)
    self.single_digits_graph = single_digits_graph + pynini.closure(
        pynutil.insert(" ") + single_digits_graph)
    if not deterministic:
        # Digit-by-digit reading of comma-grouped numbers, e.g. "1,234".
        single_digits_graph_with_commas = pynini.closure(
            self.single_digits_graph + pynutil.insert(" "), 1,
            3) + pynini.closure(
                pynutil.delete(",") + single_digits_graph +
                pynutil.insert(" ") + single_digits_graph +
                pynutil.insert(" ") + single_digits_graph,
                1,
            )
        self.graph |= self.single_digits_graph | get_hundreds_graph(
        ) | single_digits_graph_with_commas
        # Ranges like "2-3" -> "from two to three" / "two three".
        self.range_graph = (
            pynini.closure(pynutil.insert("from "), 0, 1) + self.graph +
            (pynini.cross("-", " to ") | pynini.cross("-", " ")) + self.graph)
        # Dimensions like "2x3" -> "two by three".
        self.range_graph |= self.graph + (pynini.cross(
            "x", " by ") | pynini.cross(" x ", " by ")) + self.graph
        self.range_graph = self.range_graph.optimize()
    optional_minus_graph = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)
    # Serial numbers are penalized so plain cardinals win when both match.
    final_graph = self.graph | pynutil.add_weight(self.get_serial_graph(),
                                                  1.2)
    if not deterministic:
        final_graph |= self.range_graph
    final_graph = optional_minus_graph + pynutil.insert(
        "integer: \"") + final_graph + pynutil.insert("\"")
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def testFilledExporter(self):
    """Exports two FSTs and verifies that both end up stored in the FAR file."""
    far_path = os.path.join(FLAGS.test_tmpdir, 'test.far')
    FLAGS.output = far_path
    # grm.run exits the process when done, hence the SystemExit guard.
    with self.assertRaises(SystemExit):
        grm.run(generator_method)
    with pynini.Far(far_path, 'r') as far:
        exported = dict(far)
    self.assertLen(exported, 2)
    for fst_name in ('FST1', 'FST2'):
        self.assertTrue(exported[fst_name])
def OpenFstFromFarSafe(far_dir: pathlib.Path, far_name: str, token_type: str,
                       fst_name: str, default: pynini.Fst) -> pynini.Fst:
    """Returns FST from a given FAR; returns default if FST is not found."""
    # KeyError here is intentional: an unknown token_type is a caller bug.
    suffix = {"byte": "", "utf8": "_utf8"}[token_type]
    path = far_dir / f"{far_name}{suffix}.far"
    if not IsFileExist(path):
        return default
    with pynini.Far(AsResourcePath(path), "r") as far:
        try:
            # FAR entries are conventionally stored under upper-case names.
            return far[fst_name.upper()]
        except KeyError:
            return default
def setUp(self):
    """Loads the letter/romanization protos and builds the round-trip FST."""
    super().setUp()
    self._letters_proto = letter_languages.read_textproto(
        u.LANG_DIR / 'letter_languages.textproto')
    self._roman_proto = unicode_strings_util.read_textproto(
        u.LANG_DIR / 'reversible_roman.textproto')
    far_path = u.FAR_DIR / 'reversible_roman.far'
    with pynini.Far(uf.AsResourcePath(far_path), 'r') as far:
        to_roman = far['FROM_ARAB']
        from_roman = far['TO_ARAB']
        # Romanize then de-romanize: should be (close to) the identity.
        self._round_trip = to_roman @ from_roman
def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
    """Builds the punctuation post-processing graph, restoring it from a FAR
    cache when one is available.

    Args:
        cache_dir: directory for caching the compiled FAR; the literal string
            "None" also disables caching.
        overwrite_cache: when True, rebuild the graph even if a cached FAR exists.
    """
    far_file = None
    if cache_dir is not None and cache_dir != "None":
        os.makedirs(cache_dir, exist_ok=True)
        far_file = os.path.join(cache_dir, "en_tn_post_processing.far")
    cache_hit = not overwrite_cache and far_file and os.path.exists(far_file)
    if cache_hit:
        self.fst = pynini.Far(far_file, mode="r")["post_process_graph"]
        logging.info(
            f'Post processing graph was restored from {far_file}.')
    else:
        self.set_punct_dict()
        self.fst = self.get_punct_postprocess_graph()
        if far_file:
            generator_main(far_file, {"post_process_graph": self.fst})
def run_experiments(roots1, roots2):
    """Runs FLAGS.number_of_experiments experiments.

    Args:
      roots1: A Roots class instance
      roots2: A Roots class instance
    """
    mapping_rule = py.Far(FLAGS.far)[FLAGS.mapping_rule]
    for run_idx in range(FLAGS.number_of_experiments):
        pairs = produce_paired_etyma(roots1, roots2)
        matches = 0
        for lhs, rhs in pairs:
            # A pair counts when its best mapping cost is within the threshold.
            if best_score(
                    lhs * mapping_rule * rhs) <= FLAGS.levenshtein_threshold:
                print(f"{lhs}\t{rhs}")
                matches += 1
        print(f"RUN:\t{run_idx}\t{matches}")
        sys.stdout.flush()
def main(unused_argv):
    """Samples random paths from the selected rule FST and prints their counts."""
    fst = py.Far(FLAGS.far)[FLAGS.rule]
    # Pushing weights to the beginning avoids spurious selection of "free"
    # cases where the first byte of a UTF8 character carries no weight:
    #
    #   fst = py.push(fst, push_weights=True, to_final=False)
    #
    # However it seems to produce artifacts of its own, like endless series of
    # Greek roots starting with "drai"; on the other hand, without it PAN gets
    # endless roots starting with ñ. So it stays behind a flag.
    if FLAGS.push:
        fst = py.push(fst, push_weights=True, to_final=False)
    rand = py.randgen(fst,
                      npath=FLAGS.npaths,
                      seed=int(time.time()),
                      select="log_prob",
                      weighted=True)
    print(Counter(rand.paths().ostrings()))
def __init__(self):
    """Cardinal tagger: digit strings (optionally comma-grouped, optionally
    negative) -> 'integer: "..."' tokens."""
    super().__init__(name="cardinal", kind="classify")
    # Pre-compiled digit-string -> number-name transducer.
    number_names = pynini.Far(
        get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()
    # 2-3 digit strings, or a single non-zero digit.
    self.graph_hundred_component_at_least_one_none_zero_digit = (
        pynini.closure(NEMO_DIGIT, 2, 3)
        | pynini.difference(NEMO_DIGIT, pynini.accep("0"))) @ number_names
    # Up to 3 leading digits, then optional-comma 3-digit groups.
    grouped_digits = pynini.closure(NEMO_DIGIT, 1, 3) + pynini.closure(
        pynini.closure(pynutil.delete(","), 0, 1) + NEMO_DIGIT + NEMO_DIGIT +
        NEMO_DIGIT)
    self.graph = grouped_digits @ number_names
    # Leading "-" becomes the negative: "true" attribute.
    minus = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)
    tagged = (minus + pynutil.insert("integer: \"") + self.graph +
              pynutil.insert("\""))
    tagged = self.add_tokens(tagged)
    self.fst = tagged.optimize()
def get_alternative_formats():
    """Builds alternative written formats for numbers.

    Returns:
        dict with two optimized FSTs:
          'one_thousand_alternative': rewrite that replaces a number name at
              the beginning of the string with its alternative form from
              cardinals_alternatives.tsv.
          'separators': union of thousands-separator conventions (dot, space,
              or none), weighted so that "no delimiter" is preferred.
    """
    one_alternatives = load_labels(
        get_abs_path('data/numbers/cardinals_alternatives.tsv'))
    # Key each alternative by its second word; the first (default) column of
    # the TSV is not needed here, hence the `_` placeholder.
    one_thousand_map = pynini.string_map(
        [(alternative.split()[1], alternative)
         for _, alternative in one_alternatives])
    one_thousand_alternative = pynini.cdrewrite(one_thousand_map, "[BOS]", "",
                                                NEMO_SIGMA)
    # Specifies common ways of delimiting thousands in digit strings.
    t = pynini.Far(get_abs_path('data/utils/universal_thousands_punct.far'))
    separators = (pynutil.add_weight(t['dot_thousands'], 0.1)
                  | pynutil.add_weight(t['no_delimiter'], -0.1)
                  | pynutil.add_weight(t['space_thousands'], 0.1))
    # .optimize() is equivalence-preserving and keeps downstream compositions
    # small; it also matches the sibling implementation of this helper.
    return {
        'one_thousand_alternative': one_thousand_alternative.optimize(),
        'separators': separators.optimize(),
    }
def close(self) -> None:
    """Writes the registered FSTs into the given file and closes it.

    Preconditions: the exporter must still be open (asserted below).
    Side effects: writes the FAR to self._filename and flips self._is_open.
    """
    assert self._is_open
    logging.info('Writing FSTs into \'%s\'.', self._filename)
    # TODO(b/123775699): Currently pytype is unable to resolve
    # the usage of typing.Literal for pynini.Far.__init__'s far_type, producing
    # the error:
    #
    # Expected: (self, filename, mode, arc_type, far_type: Literal[str] = ...)
    # Actually passed: (self, filename, mode, arc_type, far_type: Literal[str])
    #
    # Once typing.Literal support no longer makes this error, drop
    # the below pytype disable comment.
    with pynini.Far(self._filename,
                    'w',
                    arc_type=self._arc_type,
                    far_type=self._far_type) as sink:  # pytype: disable=wrong-arg-types
        # Write in sorted-name order so the FAR contents are deterministic.
        for name in sorted(self._fsts):
            logging.info('Writing FST \'%s\' into \'%s\'.', name,
                         self._filename)
            sink[name] = self._fsts[name]
    logging.info('Writing FSTs into \'%s\' done.', self._filename)
    self._is_open = False
def _LoadFar(self) -> pynini.Far:
    """Opens the FAR archive referenced by this instance's path."""
    resource_path = uf.AsResourcePath(self._path_to_far)
    return pynini.Far(resource_path)
def get_number_names():
    """
    Creates numbers names.

    Based on:
    1) Gorman, K., and Sproat, R. 2016. Minimally supervised number normalization.
       Transactions of the Association for Computational Linguistics 4: 507-519.
    and
    2) Ng, A. H., Gorman, K., and Sproat, R. 2017. Minimally supervised written-to-spoken text normalization.
       In ASRU, pages 665-670.

    Returns:
        dict with 'ordinal_number_names', 'cardinal_number_names' and
        'nominative_up_to_thousand_names' FSTs mapping digit strings to
        Russian number names in the respective grammatical forms.
    """
    # Arithmetic factorization transducers: FG maps a digit string to its
    # arithmetic decomposition, e.g. "230" -> "(+ 200 30 +)".
    a = pynini.Far(get_abs_path('data/utils/util_arithmetic.far'), mode='r')
    d = a['DELTA_STAR']
    f = a['IARITHMETIC_RESTRICTED']
    g = pynini.Fst.read(get_abs_path('data/utils/g.fst'))
    fg = (d @ (f @ (f @ (f @ g).optimize()).optimize()).optimize()).optimize()
    # Sanity-check the factorizer before building anything on top of it.
    assert rewrite.top_rewrite("230", fg) == "(+ 200 30 +)"
    # Compiles lexicon transducers (L), one per grammatical case.
    cardinal_name_nominative = pynini.string_file(
        get_abs_path("data/numbers/1_cardinals_nominative_именительный.tsv")
    ).optimize()
    cardinal_name_genitive = pynini.string_file(
        get_abs_path(
            "data/numbers/2_cardinals_genitive_родительный.tsv")).optimize()
    cardinal_name_dative = pynini.string_file(
        get_abs_path(
            "data/numbers/3_cardinals_dative_датильный.tsv")).optimize()
    cardinal_name_accusative = pynini.string_file(
        get_abs_path(
            "data/numbers/4_cardinals_accusative_винительный.tsv")).optimize()
    cardinal_name_instrumental = pynini.string_file(
        get_abs_path("data/numbers/5_cardinals_instrumental_творительный.tsv")
    ).optimize()
    cardinal_name_prepositional = pynini.string_file(
        get_abs_path("data/numbers/6_cardinals_prepositional_предложный.tsv")
    ).optimize()
    # A cardinal name is one or more space-separated words of the same case.
    cardinal_l = (
        pynini.closure(cardinal_name_nominative + pynini.accep(" ")) +
        cardinal_name_nominative).optimize()
    for case in [
            cardinal_name_genitive,
            cardinal_name_dative,
            cardinal_name_accusative,
            cardinal_name_instrumental,
            cardinal_name_prepositional,
    ]:
        cardinal_l |= (pynini.closure(case + pynini.accep(" ")) +
                       case).optimize()
    # Numbers up to 1000 in nominative case (to use, for example, with telephone)
    nominative_up_to_thousand_name = pynini.string_file(
        get_abs_path("data/numbers/cardinals_nominative_case.tsv"))
    nominative_up_to_thousand_name_l = (
        pynini.closure(nominative_up_to_thousand_name + pynini.accep(" ")) +
        nominative_up_to_thousand_name).optimize()
    # Convert e.g. "(* 5 1000 *)" back to "5000" so complex ordinals will be formed correctly,
    # e.g. "пятитысячный" will eventually be formed. (If we didn't do this, the incorrect phrase
    # "пять тысячный" would be formed).
    # We do this for all thousands from "(*2 1000 *)" —> "2000" to "(*20 1000 *)" —> "20000".
    # We do not go higher, in order to prevent the WFST graph becoming even larger.
    complex_numbers = pynini.cross("(* 2 1000 *)", "2000")
    for number in range(3, 21):
        complex_numbers |= pynini.cross(f"(* {number} 1000 *)",
                                        f"{number}000")
    complex_numbers = (NEMO_SIGMA +
                       pynutil.add_weight(complex_numbers, -1) +
                       pynini.closure(pynini.union(" ", ")", "(", "+", "*")))
    # Prefer the collapsed-thousands factorization for ordinals when it applies.
    fg_ordinal = pynutil.add_weight(pynini.compose(fg, complex_numbers),
                                    -1) | fg
    ordinal_name = pynini.string_file(
        get_abs_path("data/numbers/ordinals.tsv"))
    # Ordinals: nominative cardinal words followed by a final ordinal word.
    ordinal_l = (pynini.closure(cardinal_name_nominative + pynini.accep(" "))
                 + ordinal_name).optimize()
    # Composes L with the leaf transducer (P), then composes that with FG.
    p = a['LEAVES']
    number_names = {}
    number_names['ordinal_number_names'] = (
        fg_ordinal @ (p @ ordinal_l)).optimize()
    number_names['cardinal_number_names'] = (fg @
                                             (p @ cardinal_l)).optimize()
    number_names['nominative_up_to_thousand_names'] = (
        fg @ (p @ nominative_up_to_thousand_name_l)).optimize()
    return number_names
def __init__(
    self,
    input_case: str,
    deterministic: bool = True,
    cache_dir: str = None,
    overwrite_cache: bool = False,
    whitelist: str = None,
):
    """Builds (or restores from a FAR cache) the full English TN tokenize-and-classify FST.

    Args:
        input_case: accepted input case, e.g. lower-cased or case-sensitive.
        deterministic: when True, only a single tagging per token is produced.
        cache_dir: directory for caching the compiled FAR; the literal string
            "None" also disables caching.
        overwrite_cache: when True, recompile grammars even if a cached FAR exists.
        whitelist: optional path to a custom whitelist TSV; its basename is
            folded into the cache-file name so different whitelists don't collide.
    """
    super().__init__(name="tokenize_and_classify",
                     kind="classify",
                     deterministic=deterministic)
    far_file = None
    if cache_dir is not None and cache_dir != "None":
        os.makedirs(cache_dir, exist_ok=True)
        whitelist_file = os.path.basename(whitelist) if whitelist else ""
        far_file = os.path.join(
            cache_dir,
            f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
        )
    if not overwrite_cache and far_file and os.path.exists(far_file):
        self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
        logging.info(f'ClassifyFst.fst was restored from {far_file}.')
    else:
        logging.info(f"Creating ClassifyFst grammars.")
        # Component taggers; cardinal/ordinal/decimal/fraction instances are
        # shared so dependent grammars reuse the same sub-FSTs.
        cardinal = CardinalFst(deterministic=deterministic)
        cardinal_graph = cardinal.fst
        ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic)
        ordinal_graph = ordinal.fst
        decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic)
        decimal_graph = decimal.fst
        fraction = FractionFst(deterministic=deterministic,
                               cardinal=cardinal)
        fraction_graph = fraction.fst
        measure = MeasureFst(cardinal=cardinal,
                             decimal=decimal,
                             fraction=fraction,
                             deterministic=deterministic)
        measure_graph = measure.fst
        date_graph = DateFst(cardinal=cardinal,
                             deterministic=deterministic).fst
        word_graph = WordFst(deterministic=deterministic).fst
        time_graph = TimeFst(cardinal=cardinal,
                             deterministic=deterministic).fst
        telephone_graph = TelephoneFst(deterministic=deterministic).fst
        electonic_graph = ElectronicFst(deterministic=deterministic).fst
        money_graph = MoneyFst(cardinal=cardinal,
                               decimal=decimal,
                               deterministic=deterministic).fst
        whitelist_graph = WhiteListFst(input_case=input_case,
                                      deterministic=deterministic,
                                      input_file=whitelist).fst
        punct_graph = PunctuationFst(deterministic=deterministic).fst
        serial_graph = SerialFst(cardinal=cardinal,
                                 ordinal=ordinal,
                                 deterministic=deterministic).fst
        # Time/date verbalizers: RangeFst needs fully verbalized time/date
        # spans, so the taggers are composed with their verbalizers here.
        v_time_graph = vTimeFst(deterministic=deterministic).fst
        v_ordinal_graph = vOrdinalFst(deterministic=deterministic)
        v_date_graph = vDateFst(ordinal=v_ordinal_graph,
                                deterministic=deterministic).fst
        time_final = pynini.compose(time_graph, v_time_graph)
        date_final = pynini.compose(date_graph, v_date_graph)
        range_graph = RangeFst(time=time_final,
                               date=date_final,
                               cardinal=cardinal,
                               deterministic=deterministic).fst
        # Union of all semiotic classes; lower weight wins, so whitelist (1.01)
        # and date (1.09) are preferred over the 1.1 classes.
        classify = (
            pynutil.add_weight(whitelist_graph, 1.01)
            | pynutil.add_weight(time_graph, 1.1)
            | pynutil.add_weight(date_graph, 1.09)
            | pynutil.add_weight(decimal_graph, 1.1)
            | pynutil.add_weight(measure_graph, 1.1)
            | pynutil.add_weight(cardinal_graph, 1.1)
            | pynutil.add_weight(ordinal_graph, 1.1)
            | pynutil.add_weight(money_graph, 1.1)
            | pynutil.add_weight(telephone_graph, 1.1)
            | pynutil.add_weight(electonic_graph, 1.1)
            | pynutil.add_weight(fraction_graph, 1.1)
            | pynutil.add_weight(range_graph, 1.1)
            | pynutil.add_weight(
                serial_graph,
                1.1001)  # should be higher than the rest of the classes
        )
        # roman_graph = RomanFst(deterministic=deterministic).fst
        # classify |= pynutil.add_weight(roman_graph, 1.1)
        if not deterministic:
            abbreviation_graph = AbbreviationFst(
                deterministic=deterministic).fst
            classify |= pynutil.add_weight(abbreviation_graph, 100)
        punct = pynutil.insert("tokens { ") + pynutil.add_weight(
            punct_graph, weight=2.1) + pynutil.insert(" }")
        # Between tokens: collapse whitespace runs or attach punctuation tokens.
        punct = pynini.closure(
            pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1),
                           delete_extra_space)
            | (pynutil.insert(" ") + punct),
            1,
        )
        # Plain words are the fallback of last resort.
        classify |= pynutil.add_weight(word_graph, 100)
        token = pynutil.insert("tokens { ") + classify + pynutil.insert(
            " }")
        token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) +
                            token +
                            pynini.closure(pynutil.insert(" ") + punct))
        graph = token_plus_punct + pynini.closure(
            (pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1),
                            delete_extra_space)
             | (pynutil.insert(" ") + punct + pynutil.insert(" "))) +
            token_plus_punct)
        graph = delete_space + graph + delete_space
        # Allow an input that is punctuation only.
        graph |= punct
        self.fst = graph.optimize()
        if far_file:
            generator_main(far_file, {"tokenize_and_classify": self.fst})
            logging.info(f"ClassifyFst grammars are saved to {far_file}.")
def _read_fst_map(filename):
    """Loads every FST stored in the FAR at `filename`, keyed by name."""
    with pynini.Far(filename) as far:
        return dict(far)
closure).optimize()
# Vowel insertion to break consonant clusters caused by suffixes
insertion = pynini.cdrewrite(pynini.transducer("", "e"), consonants, suffixes,
                             closure).optimize()
# Finnish seems to attempt preserving morae count with /s/ as a syllabic end.
# Generates a stop that assimilates 'highness' of vowel and becomes /k/.
# In case this generated stop occurs after VV, it instead assimilates /s/ and
# becomes /t/. Then gradation occurs due to /e/ insertion.
# Similar situation seemed to occur with /s/ -> /a/ / /a/_ + suffix. So was
# added to transducer.
final_stress_preservation = pynini.cdrewrite(
    pynini.transducer("s", "t"), vowels + (pynini.acceptor("y") | "u"),
    suffixes, closure) * pynini.cdrewrite(
        pynini.transducer("", "k"),
        pynini.acceptor("y") | "u", "s" + suffixes, closure) * pynini.cdrewrite(
            pynini.transducer("s", "a"), "a", suffixes, closure)
final_stress_preservation.optimize()
# Rule for /nt/ assimilation.
nt_assimilation = pynini.cdrewrite(pynini.transducer("t", "n"), "n",
                                   vowels + suffixes, closure).optimize()
# Intersection of rules: each case transducer is the base form composed with
# every phonological adjustment rule above.
transducer_adessive = regularize * transducer_adessive_base * nt_assimilation * final_stress_preservation * insertion * consonant_reduction * rvregularize
transducer_inessive = regularize * transducer_inessive_base * nt_assimilation * final_stress_preservation * insertion * consonant_reduction * rvregularize
# Generates FAR with one entry per grammatical case.
with pynini.Far("finnish.far", "w") as sink:
    sink["ADESSIVE"] = transducer_adessive
    sink["INESSIVE"] = transducer_inessive
def __init__(self, far_path: str):
    """Opens the FAR at `far_path` for reading; starts with no rules selected."""
    self.rules = []
    self.far = pynini.Far(far_path, "r")
def __init__(
    self,
    input_case: str,
    deterministic: bool = True,
    cache_dir: str = None,
    overwrite_cache: bool = True,
    whitelist: str = None,
):
    """Builds (or restores from a FAR cache) a single-pass FST that both
    classifies and verbalizes: each tagger is composed with its verbalizer.

    Args:
        input_case: accepted input case, e.g. lower-cased or case-sensitive.
        deterministic: when True, only a single output per input is produced.
        cache_dir: directory for caching the compiled FAR; the literal string
            'None' also disables caching.
        overwrite_cache: when True, recompile grammars even if a cached FAR
            exists (note: defaults to True here, unlike the sibling classes).
        whitelist: optional path to a custom whitelist TSV; its basename is
            folded into the cache-file name.
    """
    super().__init__(name="tokenize_and_classify",
                     kind="classify",
                     deterministic=deterministic)
    far_file = None
    if cache_dir is not None and cache_dir != 'None':
        os.makedirs(cache_dir, exist_ok=True)
        whitelist_file = os.path.basename(whitelist) if whitelist else ""
        far_file = os.path.join(
            cache_dir,
            f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
        )
    if not overwrite_cache and far_file and os.path.exists(far_file):
        self.fst = pynini.Far(far_file, mode='r')['tokenize_and_classify']
        logging.info(f'ClassifyFst.fst was restored from {far_file}.')
    else:
        logging.info(
            f'Creating ClassifyFst grammars. This might take some time...')
        # TAGGERS
        cardinal = CardinalFst(deterministic=deterministic)
        cardinal_graph = cardinal.fst
        ordinal = OrdinalFst(cardinal=cardinal,
                             deterministic=deterministic)
        ordinal_graph = ordinal.fst
        decimal = DecimalFst(cardinal=cardinal,
                             deterministic=deterministic)
        decimal_graph = decimal.fst
        fraction = FractionFst(deterministic=deterministic,
                               cardinal=cardinal)
        fraction_graph = fraction.fst
        measure = MeasureFst(cardinal=cardinal,
                             decimal=decimal,
                             fraction=fraction,
                             deterministic=deterministic)
        measure_graph = measure.fst
        date_graph = DateFst(cardinal=cardinal,
                             deterministic=deterministic).fst
        word_graph = WordFst(deterministic=deterministic).graph
        time_graph = TimeFst(cardinal=cardinal,
                             deterministic=deterministic).fst
        telephone_graph = TelephoneFst(deterministic=deterministic).fst
        electronic_graph = ElectronicFst(deterministic=deterministic).fst
        money_graph = MoneyFst(cardinal=cardinal,
                               decimal=decimal,
                               deterministic=deterministic).fst
        # `whitelist` is rebound from the path argument to the WhiteListFst
        # instance; the instance is reused by AbbreviationFst below.
        whitelist = WhiteListFst(input_case=input_case,
                                 deterministic=deterministic,
                                 input_file=whitelist)
        whitelist_graph = whitelist.graph
        punct_graph = PunctuationFst(deterministic=deterministic).graph
        # VERBALIZERS (the `v*` aliases are the verbalizer counterparts).
        # Note: `cardinal`, `decimal`, `ordinal`, `fraction` and `measure` are
        # rebound from taggers to verbalizers from here on.
        cardinal = vCardinal(deterministic=deterministic)
        v_cardinal_graph = cardinal.fst
        decimal = vDecimal(cardinal=cardinal, deterministic=deterministic)
        v_decimal_graph = decimal.fst
        ordinal = vOrdinal(deterministic=deterministic)
        v_ordinal_graph = ordinal.fst
        fraction = vFraction(deterministic=deterministic)
        v_fraction_graph = fraction.fst
        v_telephone_graph = vTelephone(deterministic=deterministic).fst
        v_electronic_graph = vElectronic(deterministic=deterministic).fst
        measure = vMeasure(decimal=decimal,
                           cardinal=cardinal,
                           fraction=fraction,
                           deterministic=deterministic)
        v_measure_graph = measure.fst
        v_time_graph = vTime(deterministic=deterministic).fst
        v_date_graph = vDate(ordinal=ordinal,
                             deterministic=deterministic).fst
        v_money_graph = vMoney(decimal=decimal,
                               deterministic=deterministic).fst
        v_roman_graph = vRoman(deterministic=deterministic).fst
        v_abbreviation = vAbbreviation(deterministic=deterministic).fst
        # Each tagger is composed with its verbalizer so the output is the
        # final spoken form directly; lower weight wins.
        classify_and_verbalize = (
            pynutil.add_weight(whitelist_graph, 1.01)
            | pynutil.add_weight(pynini.compose(time_graph, v_time_graph),
                                 1.1)
            | pynutil.add_weight(
                pynini.compose(decimal_graph, v_decimal_graph), 1.1)
            | pynutil.add_weight(
                pynini.compose(measure_graph, v_measure_graph), 1.1)
            | pynutil.add_weight(
                pynini.compose(cardinal_graph, v_cardinal_graph), 1.1)
            | pynutil.add_weight(
                pynini.compose(ordinal_graph, v_ordinal_graph), 1.1)
            | pynutil.add_weight(
                pynini.compose(telephone_graph, v_telephone_graph), 1.1)
            | pynutil.add_weight(
                pynini.compose(electronic_graph, v_electronic_graph), 1.1)
            | pynutil.add_weight(
                pynini.compose(fraction_graph, v_fraction_graph), 1.1)
            | pynutil.add_weight(
                pynini.compose(money_graph, v_money_graph), 1.1)
            | pynutil.add_weight(word_graph, 100)
            | pynutil.add_weight(pynini.compose(date_graph, v_date_graph),
                                 1.09)).optimize()
        if not deterministic:
            roman_graph = RomanFst(deterministic=deterministic).fst
            # the weight matches the word_graph weight for "I" cases in long sentences with multiple semiotic tokens
            classify_and_verbalize |= pynutil.add_weight(
                pynini.compose(roman_graph, v_roman_graph), 100)
            abbreviation_graph = AbbreviationFst(
                whitelist=whitelist, deterministic=deterministic).fst
            classify_and_verbalize |= pynutil.add_weight(
                pynini.compose(abbreviation_graph, v_abbreviation), 100)
        punct = pynutil.add_weight(punct_graph, weight=1.1)
        token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) +
                            classify_and_verbalize +
                            pynini.closure(pynutil.insert(" ") + punct))
        graph = token_plus_punct + pynini.closure(delete_extra_space +
                                                  token_plus_punct)
        graph = delete_space + graph + delete_space
        self.fst = graph.optimize()
        if far_file:
            generator_main(far_file, {"tokenize_and_classify": self.fst})
            logging.info(f'ClassifyFst grammars are saved to {far_file}.')
def __init__(self, input_case: str, deterministic: bool = True,
             cache_dir: str = None, overwrite_cache: bool = False):
    """Builds (or restores from a FAR cache) the English TN tokenize-and-classify FST.

    Args:
        input_case: accepted input case, e.g. lower-cased or case-sensitive.
        deterministic: when True, only a single tagging per token is produced.
        cache_dir: directory for caching the compiled FAR; the literal string
            "None" also disables caching.
        overwrite_cache: when True, recompile grammars even if a cached FAR exists.
    """
    super().__init__(name="tokenize_and_classify",
                     kind="classify",
                     deterministic=deterministic)
    far_file = None
    if cache_dir is not None and cache_dir != "None":
        os.makedirs(cache_dir, exist_ok=True)
        far_file = os.path.join(
            cache_dir,
            f"_{input_case}_en_tn_{deterministic}_deterministic.far")
    if not overwrite_cache and far_file and os.path.exists(far_file):
        self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
        logging.info(f'ClassifyFst.fst was restored from {far_file}.')
    else:
        logging.info(f"Creating ClassifyFst grammars.")
        # Component taggers; cardinal/ordinal/decimal/fraction instances are
        # shared so dependent grammars reuse the same sub-FSTs.
        cardinal = CardinalFst(deterministic=deterministic)
        cardinal_graph = cardinal.fst
        ordinal = OrdinalFst(cardinal=cardinal,
                             deterministic=deterministic)
        ordinal_graph = ordinal.fst
        decimal = DecimalFst(cardinal=cardinal,
                             deterministic=deterministic)
        decimal_graph = decimal.fst
        fraction = FractionFst(deterministic=deterministic,
                               cardinal=cardinal)
        fraction_graph = fraction.fst
        measure = MeasureFst(cardinal=cardinal,
                             decimal=decimal,
                             fraction=fraction,
                             deterministic=deterministic)
        measure_graph = measure.fst
        date_graph = DateFst(cardinal=cardinal,
                             deterministic=deterministic).fst
        word_graph = WordFst(deterministic=deterministic).fst
        time_graph = TimeFst(cardinal=cardinal,
                             deterministic=deterministic).fst
        telephone_graph = TelephoneFst(deterministic=deterministic).fst
        electonic_graph = ElectronicFst(deterministic=deterministic).fst
        money_graph = MoneyFst(cardinal=cardinal,
                               decimal=decimal,
                               deterministic=deterministic).fst
        whitelist_graph = WhiteListFst(input_case=input_case,
                                       deterministic=deterministic).fst
        punct_graph = PunctuationFst(deterministic=deterministic).fst
        # Union of all semiotic classes; lower weight wins, so whitelist (1.01)
        # and date (1.09) beat the 1.1 classes and words (100) are the fallback.
        classify = (pynutil.add_weight(whitelist_graph, 1.01)
                    | pynutil.add_weight(time_graph, 1.1)
                    | pynutil.add_weight(date_graph, 1.09)
                    | pynutil.add_weight(decimal_graph, 1.1)
                    | pynutil.add_weight(measure_graph, 1.1)
                    | pynutil.add_weight(cardinal_graph, 1.1)
                    | pynutil.add_weight(ordinal_graph, 1.1)
                    | pynutil.add_weight(money_graph, 1.1)
                    | pynutil.add_weight(telephone_graph, 1.1)
                    | pynutil.add_weight(electonic_graph, 1.1)
                    | pynutil.add_weight(fraction_graph, 1.1)
                    | pynutil.add_weight(word_graph, 100))
        if not deterministic:
            roman_graph = RomanFst(deterministic=deterministic).fst
            # the weight matches the word_graph weight for "I" cases in long sentences with multiple semiotic tokens
            classify |= pynutil.add_weight(roman_graph, 100)
            abbreviation_graph = AbbreviationFst(
                deterministic=deterministic).fst
            classify |= pynutil.add_weight(abbreviation_graph, 100)
        # Wrap each classified span (and punctuation) in "tokens { ... }" markup.
        punct = pynutil.insert("tokens { ") + pynutil.add_weight(
            punct_graph, weight=1.1) + pynutil.insert(" }")
        token = pynutil.insert("tokens { ") + classify + pynutil.insert(
            " }")
        token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) +
                            token +
                            pynini.closure(pynutil.insert(" ") + punct))
        graph = token_plus_punct + pynini.closure(delete_extra_space +
                                                  token_plus_punct)
        graph = delete_space + graph + delete_space
        self.fst = graph.optimize()
        if far_file:
            generator_main(far_file, {"tokenize_and_classify": self.fst})
            logging.info(f"ClassifyFst grammars are saved to {far_file}.")
def __init__(self, deterministic: bool = True):
    """
    Build the cardinal-number tagger grammar.

    Maps digit strings (optionally comma-delimited, optionally negative)
    to their spoken number names, tagging the result as
    ``integer: "..."``. In non-deterministic mode it additionally accepts
    digit-by-digit readings, ranges ("-" -> "to", "x" -> "by"), and
    hundreds-style readings.

    Args:
        deterministic: if True, produce a single normalization option per
            input; if False, allow multiple alternative readings.
    """
    super().__init__(name="cardinal",
                     kind="classify",
                     deterministic=deterministic)
    # TODO replace to have "oh" as a default for "0"
    # Precompiled number-name FST (digit string -> spoken cardinal).
    graph = pynini.Far(
        get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()
    # 2-3 digit numbers, or a single non-zero digit, verbalized as cardinals.
    self.graph_hundred_component_at_least_one_none_zero_digit = (
        pynini.closure(NEMO_DIGIT, 2, 3)
        | pynini.difference(NEMO_DIGIT, pynini.accep("0"))) @ graph
    # Full cardinal: 1-3 leading digits plus optional comma-separated
    # 3-digit groups (commas deleted before verbalization).
    self.graph = (pynini.closure(NEMO_DIGIT, 1, 3) + pynini.closure(
        pynini.closure(pynutil.delete(","), 0, 1) + NEMO_DIGIT + NEMO_DIGIT +
        NEMO_DIGIT)) @ graph
    graph_digit = pynini.string_file(
        get_abs_path("data/numbers/digit.tsv"))
    graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    # Digit-by-digit reading, digits separated by spaces.
    single_digits_graph = pynini.invert(graph_digit | graph_zero)
    self.single_digits_graph = single_digits_graph + pynini.closure(
        insert_space + single_digits_graph)
    if not deterministic:
        # for a single token allow only the same normalization
        # "007" -> {"oh oh seven", "zero zero seven"} not {"oh zero seven"}
        single_digits_graph_zero = pynini.invert(graph_digit | graph_zero)
        single_digits_graph_oh = pynini.invert(graph_digit) | pynini.cross(
            "0", "oh")
        self.single_digits_graph = single_digits_graph_zero + pynini.closure(
            insert_space + single_digits_graph_zero)
        self.single_digits_graph |= single_digits_graph_oh + pynini.closure(
            insert_space + single_digits_graph_oh)
        # Digit-by-digit reading of comma-grouped numbers (e.g. "1,234").
        single_digits_graph_with_commas = pynini.closure(
            self.single_digits_graph + insert_space, 1, 3) + pynini.closure(
                pynutil.delete(",") + single_digits_graph + insert_space +
                single_digits_graph + insert_space + single_digits_graph,
                1,
            )
        # Ranges: "5-10" -> "from five to ten"; "3x4" -> "three by four".
        self.range_graph = pynutil.insert(
            "from ") + self.graph + pynini.cross("-", " to ") + self.graph
        self.range_graph |= self.graph + (pynini.cross(
            "x", " by ") | pynini.cross(" x ", " by ")) + self.graph
        self.range_graph |= (pynutil.insert("from ") + get_hundreds_graph() +
                             pynini.cross("-", " to ") +
                             get_hundreds_graph())
        self.range_graph = self.range_graph.optimize()
    serial_graph = self.get_serial_graph()
    # Optional leading "-" tagged as negative: "true".
    optional_minus_graph = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1)
    if deterministic:
        # 5+ digit numbers are read digit-by-digit; small negative weight
        # makes this reading win over the cardinal reading.
        long_numbers = pynini.compose(NEMO_DIGIT**(5, ...),
                                      self.single_digits_graph).optimize()
        final_graph = self.graph | serial_graph | pynutil.add_weight(
            long_numbers, -0.001)
        # Numbers with leading zeros are read digit-by-digit.
        cardinal_with_leading_zeros = pynini.compose(
            pynini.accep("0") + pynini.closure(NEMO_DIGIT),
            self.single_digits_graph)
        final_graph |= cardinal_with_leading_zeros
    else:
        # Leading zeros read digit-by-digit, remainder as a cardinal.
        leading_zeros = pynini.compose(
            pynini.closure(pynini.accep("0"), 1), self.single_digits_graph)
        cardinal_with_leading_zeros = (
            leading_zeros + pynutil.insert(" ") +
            pynini.compose(pynini.closure(NEMO_DIGIT), self.graph))
        final_graph = (self.graph | serial_graph | self.range_graph |
                       self.single_digits_graph | get_hundreds_graph() |
                       pynutil.add_weight(
                           single_digits_graph_with_commas, 0.001) |
                       cardinal_with_leading_zeros)
    final_graph = optional_minus_graph + pynutil.insert(
        "integer: \"") + final_graph + pynutil.insert("\"")
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def __init__(
    self,
    input_case: str,
    deterministic: bool = True,
    cache_dir: str = None,
    overwrite_cache: bool = True,  # NOTE(review): default True differs from the other ClassifyFst variants (False) — confirm this is intentional
    whitelist: str = None,
):
    """
    Build a combined classify-and-verbalize grammar for English TN.

    Unlike the two-pass tagger/verbalizer pipeline, this constructor
    composes each tagger FST directly with its verbalizer FST, producing a
    single FST (``self.fst``) that maps raw text straight to normalized
    text. ``self.fst_no_digits`` is the same grammar restricted to outputs
    containing no digits. The compiled grammar can be cached/restored as a
    FAR file.

    Args:
        input_case: casing of the input text (forwarded to WhiteListFst).
        deterministic: if True, produce a single normalization option.
        cache_dir: directory for the compiled FAR cache; the string 'None'
            disables caching like ``None`` does.
        overwrite_cache: if True, rebuild the grammar even when cached.
        whitelist: optional path to a whitelist TSV file; its basename is
            folded into the cache-file name.
    """
    super().__init__(name="tokenize_and_classify",
                     kind="classify",
                     deterministic=deterministic)
    far_file = None
    if cache_dir is not None and cache_dir != 'None':
        os.makedirs(cache_dir, exist_ok=True)
        whitelist_file = os.path.basename(whitelist) if whitelist else ""
        far_file = os.path.join(
            cache_dir,
            f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
        )
    if not overwrite_cache and far_file and os.path.exists(far_file):
        # Fast path: restore the compiled grammar from the cache.
        self.fst = pynini.Far(far_file, mode='r')['tokenize_and_classify']
        no_digits = pynini.closure(pynini.difference(
            NEMO_CHAR, NEMO_DIGIT))
        self.fst_no_digits = pynini.compose(self.fst, no_digits).optimize()
        logging.info(f'ClassifyFst.fst was restored from {far_file}.')
    else:
        logging.info(
            f'Creating ClassifyFst grammars. This might take some time...')
        # TAGGERS
        cardinal = CardinalFst(deterministic=deterministic)
        cardinal_graph = cardinal.fst
        ordinal = OrdinalFst(cardinal=cardinal,
                             deterministic=deterministic)
        # Deterministic ordinal is needed separately for the serial grammar.
        deterministic_ordinal = OrdinalFst(cardinal=cardinal,
                                           deterministic=True)
        ordinal_graph = ordinal.fst
        decimal = DecimalFst(cardinal=cardinal,
                             deterministic=deterministic)
        decimal_graph = decimal.fst
        fraction = FractionFst(deterministic=deterministic,
                               cardinal=cardinal)
        fraction_graph = fraction.fst
        measure = MeasureFst(cardinal=cardinal,
                             decimal=decimal,
                             fraction=fraction,
                             deterministic=deterministic)
        measure_graph = measure.fst
        date_graph = DateFst(cardinal=cardinal,
                             deterministic=deterministic).fst
        word_graph = WordFst(deterministic=deterministic).graph
        time_graph = TimeFst(cardinal=cardinal,
                             deterministic=deterministic).fst
        telephone_graph = TelephoneFst(deterministic=deterministic).fst
        electronic_graph = ElectronicFst(deterministic=deterministic).fst
        money_graph = MoneyFst(cardinal=cardinal,
                               decimal=decimal,
                               deterministic=deterministic).fst
        # NOTE: `whitelist` (the path argument) is rebound here to the
        # WhiteListFst object; the object is reused for AbbreviationFst below.
        whitelist = WhiteListFst(input_case=input_case,
                                 deterministic=deterministic,
                                 input_file=whitelist)
        whitelist_graph = whitelist.graph
        punct_graph = PunctuationFst(deterministic=deterministic).graph
        serial_graph = SerialFst(cardinal=cardinal,
                                 ordinal=deterministic_ordinal,
                                 deterministic=deterministic).fst
        # VERBALIZERS
        cardinal = vCardinal(deterministic=deterministic)
        v_cardinal_graph = cardinal.fst
        decimal = vDecimal(cardinal=cardinal, deterministic=deterministic)
        v_decimal_graph = decimal.fst
        ordinal = vOrdinal(deterministic=deterministic)
        v_ordinal_graph = ordinal.fst
        fraction = vFraction(deterministic=deterministic)
        v_fraction_graph = fraction.fst
        v_telephone_graph = vTelephone(deterministic=deterministic).fst
        v_electronic_graph = vElectronic(deterministic=deterministic).fst
        measure = vMeasure(decimal=decimal,
                           cardinal=cardinal,
                           fraction=fraction,
                           deterministic=deterministic)
        v_measure_graph = measure.fst
        v_time_graph = vTime(deterministic=deterministic).fst
        v_date_graph = vDate(ordinal=ordinal,
                             deterministic=deterministic).fst
        v_money_graph = vMoney(decimal=decimal,
                               deterministic=deterministic).fst
        v_roman_graph = vRoman(deterministic=deterministic).fst
        v_abbreviation = vAbbreviation(deterministic=deterministic).fst
        # Deterministic time/date pipelines feed the range grammar, so a
        # range always gets a single canonical reading for its endpoints.
        det_v_time_graph = vTime(deterministic=True).fst
        det_v_date_graph = vDate(ordinal=vOrdinal(deterministic=True),
                                 deterministic=True).fst
        time_final = pynini.compose(time_graph, det_v_time_graph)
        date_final = pynini.compose(date_graph, det_v_date_graph)
        range_graph = RangeFst(time=time_final,
                               date=date_final,
                               cardinal=CardinalFst(deterministic=True),
                               deterministic=deterministic).fst
        v_word_graph = vWord(deterministic=deterministic).fst
        # Weights: semiotic classes (1) beat plain words (100); punctuation
        # sits in between (2). Lower weight wins.
        sem_w = 1
        word_w = 100
        punct_w = 2
        # Each tagger is composed with its verbalizer: input text maps
        # directly to normalized output, no intermediate token markup.
        classify_and_verbalize = (
            pynutil.add_weight(whitelist_graph, sem_w) |
            pynutil.add_weight(pynini.compose(time_graph, v_time_graph),
                               sem_w) |
            pynutil.add_weight(
                pynini.compose(decimal_graph, v_decimal_graph), sem_w) |
            pynutil.add_weight(
                pynini.compose(measure_graph, v_measure_graph), sem_w) |
            pynutil.add_weight(
                pynini.compose(cardinal_graph, v_cardinal_graph), sem_w) |
            pynutil.add_weight(
                pynini.compose(ordinal_graph, v_ordinal_graph), sem_w) |
            pynutil.add_weight(
                pynini.compose(telephone_graph, v_telephone_graph), sem_w) |
            pynutil.add_weight(
                pynini.compose(electronic_graph, v_electronic_graph), sem_w) |
            pynutil.add_weight(
                pynini.compose(fraction_graph, v_fraction_graph), sem_w) |
            pynutil.add_weight(
                pynini.compose(money_graph, v_money_graph), sem_w) |
            pynutil.add_weight(word_graph, word_w) |
            pynutil.add_weight(pynini.compose(date_graph, v_date_graph),
                               sem_w - 0.01) |
            pynutil.add_weight(pynini.compose(range_graph, v_word_graph),
                               sem_w) |
            pynutil.add_weight(
                pynini.compose(serial_graph, v_word_graph), 1.1001)
            # should be higher than the rest of the classes
        ).optimize()
        if not deterministic:
            roman_graph = RomanFst(deterministic=deterministic).fst
            # the weight matches the word_graph weight for "I" cases in long
            # sentences with multiple semiotic tokens
            classify_and_verbalize |= pynutil.add_weight(
                pynini.compose(roman_graph, v_roman_graph), word_w)
            abbreviation_graph = AbbreviationFst(
                whitelist=whitelist, deterministic=deterministic).fst
            classify_and_verbalize |= pynutil.add_weight(
                pynini.compose(abbreviation_graph, v_abbreviation), word_w)
        punct_only = pynutil.add_weight(punct_graph, weight=punct_w)
        # Token separators: either collapse whitespace runs or insert a
        # space before standalone punctuation.
        punct = pynini.closure(
            pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1),
                           delete_extra_space) |
            (pynutil.insert(" ") + punct_only),
            1,
        )
        token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) +
                            classify_and_verbalize +
                            pynini.closure(pynutil.insert(" ") + punct))
        graph = token_plus_punct + pynini.closure(
            (pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1),
                            delete_extra_space) |
             (pynutil.insert(" ") + punct + pynutil.insert(" "))) +
            token_plus_punct)
        # Allow inputs that consist solely of punctuation.
        graph |= punct_only + pynini.closure(punct)
        graph = delete_space + graph + delete_space
        # Normalize internal whitespace; the second branch also strips
        # leading spaces.
        remove_extra_spaces = pynini.closure(
            NEMO_NOT_SPACE, 1) + pynini.closure(delete_extra_space +
                                                pynini.closure(NEMO_NOT_SPACE, 1))
        remove_extra_spaces |= (
            pynini.closure(pynutil.delete(" "), 1) +
            pynini.closure(NEMO_NOT_SPACE, 1) +
            pynini.closure(delete_extra_space +
                           pynini.closure(NEMO_NOT_SPACE, 1)))
        graph = pynini.compose(graph.optimize(),
                               remove_extra_spaces).optimize()
        self.fst = graph
        # Digit-free restriction of the final grammar.
        no_digits = pynini.closure(pynini.difference(
            NEMO_CHAR, NEMO_DIGIT))
        self.fst_no_digits = pynini.compose(graph, no_digits).optimize()
        if far_file:
            generator_main(far_file, {"tokenize_and_classify": self.fst})
            logging.info(f'ClassifyFst grammars are saved to {far_file}.')
def main(args: argparse.Namespace) -> None:
    """
    Train an n-gram language model from an alignment FAR.

    Pipeline: encode every FST in ``args.far_path`` into an FSA, count
    n-grams with OpenGrm ``ngramcount``, smooth with ``ngrammake``,
    optionally prune with ``ngramshrink``, then decode the model back
    through the encoder and write it to ``args.model_path``.

    Args:
        args: parsed CLI arguments; uses ``far_path``, ``order``,
            ``smoothing_method``, ``shrinking_method``,
            ``target_number_of_ngrams``, and ``model_path``.

    Raises:
        AssertionError: if an input FST fails verification.
        subprocess.CalledProcessError: if any OpenGrm tool fails.
    """
    with pynini.Far(args.far_path, mode="r") as far_reader:
        # Labels are encoded so transducers become acceptors (FSAs), which
        # is what the OpenGrm counting tools expect.
        encoder = pynini.EncodeMapper(
            far_reader.arc_type(), encode_labels=True
        )
        fsa_path = tempfile.mkstemp(text=True)[1]
        with pynini.Far(
            fsa_path,
            mode="w",
            arc_type=far_reader.arc_type(),
            far_type="default",
        ) as far_writer:
            while not far_reader.done():
                fst = far_reader.get_fst()
                assert fst.verify(), "FST is ill-formed"
                fst.encode(encoder)
                far_writer.add(far_reader.get_key(), fst)
                far_reader.next()
    count_path = tempfile.mkstemp(text=True)[1]
    lm_path = tempfile.mkstemp(text=True)[1]
    logging.info(
        "alignment.far is encoded to FSAs for training. Now training starts."
    )
    cmd = [
        "ngramcount",
        "--require_symbols=false",
        f"--order={args.order}",
        fsa_path,
        count_path,
    ]
    subprocess.check_call(cmd)
    os.remove(fsa_path)
    cmd1 = [
        "ngrammake",
        f"--method={args.smoothing_method}",
        count_path,
        lm_path,
    ]
    subprocess.check_call(cmd1)
    os.remove(count_path)
    if args.shrinking_method:
        shrunk_lm_sh = tempfile.mkstemp(text=True)[1]
        # BUGFIX: the command previously invoked the literal binary name
        # "shrinking_method", which does not exist; the OpenGrm pruning
        # tool is `ngramshrink` (cf. `ngramcount` / `ngrammake` above).
        # NOTE(review): --method is hard-coded to relative_entropy even
        # though args.shrinking_method is consulted — confirm whether the
        # arg should supply the method name.
        cmd = [
            "ngramshrink",
            "--method=relative_entropy",
            f"--target_number_of_ngrams={args.target_number_of_ngrams}",
            lm_path,
            shrunk_lm_sh,
        ]
        subprocess.check_call(cmd)
        lm_path = shrunk_lm_sh
    logging.info(
        "%s-gram %s Language model is trained.",
        args.order,
        args.smoothing_method,
    )
    # Decoding the LM: undo the label encoding so the model is a
    # transducer over the original labels again.
    model = pynini.Fst.read(lm_path)
    os.remove(lm_path)
    model.decode(encoder)
    model.write(args.model_path)
    logging.info(
        "%s-gram %s Language model is built.",
        args.order,
        args.smoothing_method,
    )
def __init__(
    self,
    input_case: str,
    deterministic: bool = False,
    cache_dir: str = None,
    overwrite_cache: bool = False,
    whitelist: str = None,
):
    """
    Build the final German text-normalization classification grammar.

    Unions the weighted semiotic-class tagger FSTs into a single
    ``tokens { ... }``-wrapped grammar, keeping each sub-grammar on
    ``self`` for reuse, and optionally caches/restores the compiled
    grammar as a FAR file.

    Args:
        input_case: casing of the input text (forwarded to WhiteListFst).
        deterministic: if True, produce a single normalization option;
            forwarded to every sub-grammar.
        cache_dir: directory for the compiled FAR cache; the string
            "None" disables caching like ``None`` does.
        overwrite_cache: if True, rebuild even when a cached FAR exists.
        whitelist: optional path to a whitelist TSV; its basename is
            folded into the cache-file name.
    """
    super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)
    far_file = None
    if cache_dir is not None and cache_dir != "None":
        os.makedirs(cache_dir, exist_ok=True)
        whitelist_file = os.path.basename(whitelist) if whitelist else ""
        far_file = os.path.join(
            cache_dir, f"_{input_case}_de_tn_{deterministic}_deterministic{whitelist_file}.far"
        )
    if not overwrite_cache and far_file and os.path.exists(far_file):
        # Fast path: restore the compiled grammar from the cache.
        self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
        logging.info(f"ClassifyFst.fst was restored from {far_file}.")
    else:
        logging.info(f"Creating ClassifyFst grammars. This might take some time...")
        # Sub-grammars kept on self; dependent classes reuse earlier ones.
        self.cardinal = CardinalFst(deterministic=deterministic)
        cardinal_graph = self.cardinal.fst
        self.ordinal = OrdinalFst(cardinal=self.cardinal, deterministic=deterministic)
        ordinal_graph = self.ordinal.fst
        self.decimal = DecimalFst(cardinal=self.cardinal, deterministic=deterministic)
        decimal_graph = self.decimal.fst
        self.fraction = FractionFst(cardinal=self.cardinal, deterministic=deterministic)
        fraction_graph = self.fraction.fst
        self.measure = MeasureFst(
            cardinal=self.cardinal, decimal=self.decimal, fraction=self.fraction, deterministic=deterministic
        )
        measure_graph = self.measure.fst
        self.date = DateFst(cardinal=self.cardinal, deterministic=deterministic)
        date_graph = self.date.fst
        word_graph = WordFst(deterministic=deterministic).fst
        self.time = TimeFst(deterministic=deterministic)
        time_graph = self.time.fst
        self.telephone = TelephoneFst(cardinal=self.cardinal, deterministic=deterministic)
        telephone_graph = self.telephone.fst
        self.electronic = ElectronicFst(deterministic=deterministic)
        electronic_graph = self.electronic.fst
        self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic)
        money_graph = self.money.fst
        self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist)
        whitelist_graph = self.whitelist.fst
        punct_graph = PunctuationFst(deterministic=deterministic).fst
        # Weighted union of all taggers: lower weight wins, so whitelist
        # (1.01) beats the 1.1 classes; plain words (100) are the fallback.
        classify = (
            pynutil.add_weight(whitelist_graph, 1.01)
            | pynutil.add_weight(time_graph, 1.1)
            | pynutil.add_weight(measure_graph, 1.1)
            | pynutil.add_weight(cardinal_graph, 1.1)
            | pynutil.add_weight(fraction_graph, 1.1)
            | pynutil.add_weight(date_graph, 1.1)
            | pynutil.add_weight(ordinal_graph, 1.1)
            | pynutil.add_weight(decimal_graph, 1.1)
            | pynutil.add_weight(money_graph, 1.1)
            | pynutil.add_weight(telephone_graph, 1.1)
            | pynutil.add_weight(electronic_graph, 1.1)
            | pynutil.add_weight(word_graph, 100)
        )
        # Wrap matches in `tokens { ... }` markup; punctuation becomes its
        # own token and may attach before/after a semiotic token.
        punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }")
        token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
        token_plus_punct = (
            pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct)
        )
        graph = token_plus_punct + pynini.closure(pynutil.add_weight(delete_extra_space, 1.1) + token_plus_punct)
        graph = delete_space + graph + delete_space
        self.fst = graph.optimize()
        if far_file:
            generator_main(far_file, {"tokenize_and_classify": self.fst})
            logging.info(f"ClassifyFst grammars are saved to {far_file}.")
def __init__(self,
             input_case: str,
             deterministic: bool = False,
             cache_dir: str = None,
             overwrite_cache: bool = False):
    """
    Build the final Russian text-normalization classification grammar.

    Unions the weighted semiotic-class tagger FSTs into a single
    ``tokens { ... }``-wrapped grammar, keeping each sub-grammar on
    ``self``, and optionally caches/restores the compiled grammar as a
    FAR file. Only non-deterministic mode is supported.

    Args:
        input_case: casing of the input text (forwarded to WhiteListFst).
        deterministic: must be False; Russian TN produces multiple
            normalization options.
        cache_dir: directory for the compiled FAR cache; the string
            "None" disables caching like ``None`` does.
        overwrite_cache: if True, rebuild even when a cached FAR exists.

    Raises:
        ValueError: if ``deterministic`` is True.
    """
    super().__init__(name="tokenize_and_classify",
                     kind="classify",
                     deterministic=deterministic)
    if deterministic:
        raise ValueError(
            'Ru TN only supports non-deterministic cases and produces multiple normalization options.'
        )
    far_file = None
    if cache_dir is not None and cache_dir != "None":
        os.makedirs(cache_dir, exist_ok=True)
        far_file = os.path.join(
            cache_dir,
            f"_{input_case}_ru_tn_{deterministic}_deterministic.far")
    if not overwrite_cache and far_file and os.path.exists(far_file):
        # Fast path: restore the compiled grammar from the cache.
        self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
        logging.info(f"ClassifyFst.fst was restored from {far_file}.")
    else:
        logging.info(
            f"Creating ClassifyFst grammars. This might take some time...")
        # Shared Russian number-name grammars and digit-grouping formats,
        # reused by several sub-grammars below.
        number_names = get_number_names()
        alternative_formats = get_alternative_formats()
        self.cardinal = CardinalFst(
            number_names=number_names,
            alternative_formats=alternative_formats,
            deterministic=deterministic)
        cardinal_graph = self.cardinal.fst
        self.ordinal = OrdinalFst(number_names=number_names,
                                  alternative_formats=alternative_formats,
                                  deterministic=deterministic)
        ordinal_graph = self.ordinal.fst
        self.decimal = DecimalFst(cardinal=self.cardinal,
                                  deterministic=deterministic)
        decimal_graph = self.decimal.fst
        self.measure = MeasureFst(cardinal=self.cardinal,
                                  decimal=self.decimal,
                                  deterministic=deterministic)
        measure_graph = self.measure.fst
        self.date = DateFst(number_names=number_names,
                            deterministic=deterministic)
        date_graph = self.date.fst
        word_graph = WordFst(deterministic=deterministic).fst
        self.time = TimeFst(number_names=number_names,
                            deterministic=deterministic)
        time_graph = self.time.fst
        self.telephone = TelephoneFst(number_names=number_names,
                                      deterministic=deterministic)
        telephone_graph = self.telephone.fst
        self.electronic = ElectronicFst(deterministic=deterministic)
        electronic_graph = self.electronic.fst
        self.money = MoneyFst(cardinal=self.cardinal,
                              decimal=self.decimal,
                              deterministic=deterministic)
        money_graph = self.money.fst
        self.whitelist = WhiteListFst(input_case=input_case,
                                      deterministic=deterministic)
        whitelist_graph = self.whitelist.fst
        punct_graph = PunctuationFst(deterministic=deterministic).fst
        # Weighted union of all taggers: lower weight wins; whitelist
        # (1.01) and date (1.09) are preferred over the 1.1 classes, and
        # plain words (100) are the fallback.
        # NOTE(review): measure carries 0.9 (stronger than the others) —
        # presumably deliberate tuning; confirm before changing.
        classify = (pynutil.add_weight(whitelist_graph, 1.01) |
                    pynutil.add_weight(time_graph, 1.1) |
                    pynutil.add_weight(date_graph, 1.09) |
                    pynutil.add_weight(decimal_graph, 1.1) |
                    pynutil.add_weight(measure_graph, 0.9) |
                    pynutil.add_weight(cardinal_graph, 1.1) |
                    pynutil.add_weight(ordinal_graph, 1.1) |
                    pynutil.add_weight(money_graph, 1.1) |
                    pynutil.add_weight(telephone_graph, 1.1) |
                    pynutil.add_weight(electronic_graph, 1.1) |
                    pynutil.add_weight(word_graph, 100))
        # Wrap matches in `tokens { ... }` markup; punctuation becomes its
        # own token and may attach before/after a semiotic token.
        punct = pynutil.insert("tokens { ") + pynutil.add_weight(
            punct_graph, weight=1.1) + pynutil.insert(" }")
        token = pynutil.insert("tokens { ") + classify + pynutil.insert(
            " }")
        token_plus_punct = (pynini.closure(punct + pynutil.insert(" ")) +
                            token +
                            pynini.closure(pynutil.insert(" ") + punct))
        graph = token_plus_punct + pynini.closure(
            pynutil.add_weight(delete_extra_space, 1.1) + token_plus_punct)
        graph = delete_space + graph + delete_space
        self.fst = graph.optimize()
        if far_file:
            generator_main(far_file, {"tokenize_and_classify": self.fst})
            logging.info(f"ClassifyFst grammars are saved to {far_file}.")