def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False):
    """
    Build the final verbalization graph, or restore it from a FAR cache.

    Args:
        deterministic: forwarded to the parent constructor and to the VerbalizeFst / WordFst
            grammars; it is also embedded in the cache file name.
        cache_dir: directory that holds the cached .far file. ``None`` or the literal
            string "None" disables caching.
        overwrite_cache: when True, an existing cache file is ignored and rewritten.
    """
    super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic)

    far_file = None
    caching_enabled = cache_dir is not None and cache_dir != "None"
    if caching_enabled:
        os.makedirs(cache_dir, exist_ok=True)
        far_name = f"de_tn_{deterministic}_deterministic_verbalizer.far"
        far_file = os.path.join(cache_dir, far_name)

    # Fast path: reuse a previously exported grammar.
    if far_file and not overwrite_cache and os.path.exists(far_file):
        self.fst = pynini.Far(far_file, mode="r")["verbalize"]
        logging.info(f'VerbalizeFinalFst graph was restored from {far_file}.')
        return

    verbalize_fst = VerbalizeFst(deterministic=deterministic).fst
    word_fst = WordFst(deterministic=deterministic).fst
    token_content = verbalize_fst | word_fst

    # One token: delete the `tokens { ... }` wrapper around its content.
    single_token = (
        pynutil.delete("tokens")
        + delete_space
        + pynutil.delete("{")
        + delete_space
        + token_content
        + delete_space
        + pynutil.delete("}")
    )
    # Zero or more leading tokens, then a last token without a trailing separator.
    leading_tokens = pynini.closure(single_token + delete_extra_space)
    sentence = delete_space + leading_tokens + single_token + delete_space
    self.fst = sentence.optimize()

    if far_file:
        generator_main(far_file, {"verbalize": self.fst})
        logging.info(f"VerbalizeFinalFst grammars are saved to {far_file}.")
def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
    """
    Build the tokenize-and-classify graph, or restore it from a FAR cache.

    Args:
        cache_dir: directory that holds the cached "_en_itn.far" file. ``None`` or the
            literal string "None" disables caching.
        overwrite_cache: when True, an existing cache file is ignored and rewritten.
    """
    super().__init__(name="tokenize_and_classify", kind="classify")

    far_file = None
    if cache_dir is not None and cache_dir != "None":
        os.makedirs(cache_dir, exist_ok=True)
        far_file = os.path.join(cache_dir, "_en_itn.far")

    # Fast path: reuse a previously exported grammar.
    if far_file and not overwrite_cache and os.path.exists(far_file):
        self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
        logging.info(f"ClassifyFst.fst was restored from {far_file}.")
        return

    logging.info("Creating ClassifyFst grammars.")

    cardinal = CardinalFst()
    cardinal_fst = cardinal.fst
    ordinal = OrdinalFst(cardinal)
    ordinal_fst = ordinal.fst
    decimal = DecimalFst(cardinal)
    decimal_fst = decimal.fst
    measure_fst = MeasureFst(cardinal=cardinal, decimal=decimal).fst
    date_fst = DateFst(ordinal=ordinal).fst
    word_fst = WordFst().fst
    time_fst = TimeFst().fst
    money_fst = MoneyFst(cardinal=cardinal, decimal=decimal).fst
    whitelist_fst = WhiteListFst().fst
    punct_fst = PunctuationFst().fst
    electronic_fst = ElectronicFst().fst
    telephone_fst = TelephoneFst(cardinal).fst

    # (grammar, weight) pairs, unioned left to right in this order.
    weighted_branches = (
        (whitelist_fst, 1.01),
        (time_fst, 1.1),
        (date_fst, 1.09),
        (decimal_fst, 1.1),
        (measure_fst, 1.1),
        (cardinal_fst, 1.1),
        (ordinal_fst, 1.1),
        (money_fst, 1.1),
        (telephone_fst, 1.1),
        (electronic_fst, 1.1),
        (word_fst, 100),
    )
    classify = None
    for branch, weight in weighted_branches:
        weighted = pynutil.add_weight(branch, weight)
        classify = weighted if classify is None else classify | weighted

    def as_token(inner):
        # Wrap a grammar in the `tokens { ... }` markup.
        return pynutil.insert("tokens { ") + inner + pynutil.insert(" }")

    punct = as_token(pynutil.add_weight(punct_fst, weight=1.1))
    token = as_token(classify)

    leading_punct = pynini.closure(punct + pynutil.insert(" "))
    trailing_punct = pynini.closure(pynutil.insert(" ") + punct)
    token_plus_punct = leading_punct + token + trailing_punct

    sentence = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
    sentence = delete_space + sentence + delete_space
    self.fst = sentence.optimize()

    if far_file:
        generator_main(far_file, {"tokenize_and_classify": self.fst})
        logging.info(f"ClassifyFst grammars are saved to {far_file}.")
def __init__(
    self,
    input_case: str,
    cache_dir: str = None,
    overwrite_cache: bool = False,
    deterministic: bool = True,
    whitelist: str = None,
):
    """
    Build a minimal tokenize-and-classify graph (whitelist + word + punctuation),
    or restore it from a FAR cache.

    Args:
        input_case: forwarded to WhiteListFst and embedded in the cache file name.
        cache_dir: directory that holds the cached .far file. ``None`` or the literal
            string "None" disables caching.
        overwrite_cache: when True, an existing cache file is ignored and rewritten.
        deterministic: forwarded to the parent constructor and to every grammar.
        whitelist: optional path to a whitelist file. Its base name is part of the cache
            file name, and the path is handed to WhiteListFst as ``input_file``.
    """
    super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)

    far_file = None
    if cache_dir is not None and cache_dir != "None":
        os.makedirs(cache_dir, exist_ok=True)
        whitelist_file = os.path.basename(whitelist) if whitelist else ""
        far_file = os.path.join(
            cache_dir, f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
        )

    if not overwrite_cache and far_file and os.path.exists(far_file):
        self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
        logging.info(f'ClassifyFst.fst was restored from {far_file}.')
        return

    logging.info("Creating ClassifyFst grammars.")

    word_graph = WordFst(deterministic=deterministic).fst
    # The whitelist path used to influence only the cache file name; the grammar itself
    # was always built from the default whitelist. Forward it so the cached graph matches
    # its name. The keyword is only passed when a whitelist was actually requested, so the
    # default call is unchanged.
    # NOTE(review): assumes WhiteListFst accepts `input_file`, as the sibling classifiers
    # that forward `whitelist` do - confirm against the imported class.
    whitelist_kwargs = {"input_file": whitelist} if whitelist else {}
    whitelist_graph = WhiteListFst(
        input_case=input_case, deterministic=deterministic, **whitelist_kwargs
    ).fst
    punct_graph = PunctuationFst(deterministic=deterministic).fst

    classify = pynutil.add_weight(whitelist_graph, 1) | pynutil.add_weight(word_graph, 100)

    tagged_punct = (
        pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }")
    )
    # One or more punctuation tokens, or whitespace runs rewritten by delete_extra_space.
    punct = pynini.closure(
        pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
        | (pynutil.insert(" ") + tagged_punct),
        1,
    )

    token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
    token_plus_punct = (
        pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct)
    )

    # Separator between consecutive tokens: whitespace, or punctuation padded with spaces.
    separator = pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space) | (
        pynutil.insert(" ") + punct + pynutil.insert(" ")
    )
    graph = token_plus_punct + pynini.closure(separator + token_plus_punct).optimize()
    graph = delete_space + graph + delete_space
    # Also accept input that consists of punctuation / whitespace only.
    graph |= punct

    self.fst = graph.optimize()

    if far_file:
        generator_main(far_file, {"tokenize_and_classify": self.fst})
        logging.info(f"ClassifyFst grammars are saved to {far_file}.")
def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
    """
    Build the post-processing graph, or restore it from a FAR cache.

    Args:
        cache_dir: directory that holds the cached "en_tn_post_processing.far" file.
            ``None`` or the literal string "None" disables caching.
        overwrite_cache: when True, an existing cache file is ignored and rewritten.
    """
    far_file = None
    caching_enabled = cache_dir is not None and cache_dir != "None"
    if caching_enabled:
        os.makedirs(cache_dir, exist_ok=True)
        far_file = os.path.join(cache_dir, "en_tn_post_processing.far")

    # Fast path: reuse a previously exported graph.
    if far_file and not overwrite_cache and os.path.exists(far_file):
        archive = pynini.Far(far_file, mode="r")
        self.fst = archive["post_process_graph"]
        logging.info(f'Post processing graph was restored from {far_file}.')
        return

    # The punctuation dictionary is prepared before the graph is requested.
    self.set_punct_dict()
    self.fst = self.get_punct_postprocess_graph()

    if far_file:
        generator_main(far_file, {"post_process_graph": self.fst})
def export_grammars(output_dir, grammars):
    """
    Export tokenize_and_classify and verbalize Fsts as OpenFst finite state archive (FAR) files.

    One sub-directory per grammar category is created under ``output_dir``; the
    "classify" category is written to a file named "tokenize_and_classify.far".

    Args:
        output_dir: directory to export FAR files to. Subdirectories will be created for
            tagger and verbalizer respectively.
        grammars: mapping of category name to the graphs handed to ``generator_main``.
    """
    for category, graphs in grammars.items():
        out_dir = os.path.join(output_dir, category)

        # exist_ok=True avoids a FileExistsError if the directory appears between the
        # existence check and its creation (e.g. parallel exports).
        newly_created = not os.path.exists(out_dir)
        os.makedirs(out_dir, exist_ok=True)
        if newly_created:
            # Kept from the original: short pause after creating a new directory.
            time.sleep(1)

        far_name = "tokenize_and_classify" if category == "classify" else category
        generator_main(os.path.join(out_dir, f"{far_name}.far"), graphs)
def __init__(
    self,
    input_case: str,
    deterministic: bool = False,
    cache_dir: str = None,
    overwrite_cache: bool = False,
    whitelist: str = None,
):
    """
    Build the tokenize-and-classify graph, or restore it from a FAR cache.

    When the graph is built (not restored), the individual taggers are kept on the
    instance as ``self.cardinal``, ``self.ordinal``, ``self.decimal``, ``self.fraction``,
    ``self.measure``, ``self.date``, ``self.time``, ``self.telephone``, ``self.electronic``,
    ``self.money`` and ``self.whitelist``.

    Args:
        input_case: forwarded to WhiteListFst and embedded in the cache file name.
        deterministic: forwarded to the parent constructor and to every grammar.
        cache_dir: directory that holds the cached .far file. ``None`` or the literal
            string "None" disables caching.
        overwrite_cache: when True, an existing cache file is ignored and rewritten.
        whitelist: optional whitelist file path, passed to WhiteListFst as ``input_file``;
            its base name is part of the cache file name.
    """
    super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)

    far_file = None
    if cache_dir is not None and cache_dir != "None":
        os.makedirs(cache_dir, exist_ok=True)
        whitelist_file = os.path.basename(whitelist) if whitelist else ""
        far_name = f"_{input_case}_de_tn_{deterministic}_deterministic{whitelist_file}.far"
        far_file = os.path.join(cache_dir, far_name)

    # Fast path: reuse a previously exported grammar.
    if far_file and not overwrite_cache and os.path.exists(far_file):
        self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
        logging.info(f"ClassifyFst.fst was restored from {far_file}.")
        return

    logging.info("Creating ClassifyFst grammars. This might take some time...")

    self.cardinal = CardinalFst(deterministic=deterministic)
    cardinal_fst = self.cardinal.fst

    self.ordinal = OrdinalFst(cardinal=self.cardinal, deterministic=deterministic)
    ordinal_fst = self.ordinal.fst

    self.decimal = DecimalFst(cardinal=self.cardinal, deterministic=deterministic)
    decimal_fst = self.decimal.fst

    self.fraction = FractionFst(cardinal=self.cardinal, deterministic=deterministic)
    fraction_fst = self.fraction.fst

    self.measure = MeasureFst(
        cardinal=self.cardinal, decimal=self.decimal, fraction=self.fraction, deterministic=deterministic
    )
    measure_fst = self.measure.fst

    self.date = DateFst(cardinal=self.cardinal, deterministic=deterministic)
    date_fst = self.date.fst

    word_fst = WordFst(deterministic=deterministic).fst

    self.time = TimeFst(deterministic=deterministic)
    time_fst = self.time.fst

    self.telephone = TelephoneFst(cardinal=self.cardinal, deterministic=deterministic)
    telephone_fst = self.telephone.fst

    self.electronic = ElectronicFst(deterministic=deterministic)
    electronic_fst = self.electronic.fst

    self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic)
    money_fst = self.money.fst

    self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist)
    whitelist_fst = self.whitelist.fst

    punct_fst = PunctuationFst(deterministic=deterministic).fst

    # (grammar, weight) pairs, unioned left to right in this order.
    weighted_branches = (
        (whitelist_fst, 1.01),
        (time_fst, 1.1),
        (measure_fst, 1.1),
        (cardinal_fst, 1.1),
        (fraction_fst, 1.1),
        (date_fst, 1.1),
        (ordinal_fst, 1.1),
        (decimal_fst, 1.1),
        (money_fst, 1.1),
        (telephone_fst, 1.1),
        (electronic_fst, 1.1),
        (word_fst, 100),
    )
    classify = None
    for branch, weight in weighted_branches:
        weighted = pynutil.add_weight(branch, weight)
        classify = weighted if classify is None else classify | weighted

    def as_token(inner):
        # Wrap a grammar in the `tokens { ... }` markup.
        return pynutil.insert("tokens { ") + inner + pynutil.insert(" }")

    punct = as_token(pynutil.add_weight(punct_fst, weight=1.1))
    token = as_token(classify)

    leading_punct = pynini.closure(punct + pynutil.insert(" "))
    trailing_punct = pynini.closure(pynutil.insert(" ") + punct)
    token_plus_punct = leading_punct + token + trailing_punct

    weighted_separator = pynutil.add_weight(delete_extra_space, 1.1)
    sentence = token_plus_punct + pynini.closure(weighted_separator + token_plus_punct)
    sentence = delete_space + sentence + delete_space
    self.fst = sentence.optimize()

    if far_file:
        generator_main(far_file, {"tokenize_and_classify": self.fst})
        logging.info(f"ClassifyFst grammars are saved to {far_file}.")
def __init__(
    self,
    input_case: str,
    deterministic: bool = True,
    cache_dir: str = None,
    overwrite_cache: bool = True,
    whitelist: str = None,
):
    """
    Build the combined classify-and-verbalize graph, or restore it from a FAR cache.

    Each semiotic tagger is composed with a verbalizer (``pynini.compose(tagger, verbalizer)``)
    before the weighted union is taken. Two attributes are set on both code paths:
    ``self.fst`` and ``self.fst_no_digits`` (``self.fst`` composed with the closure of
    ``NEMO_CHAR`` minus ``NEMO_DIGIT``).

    Args:
        input_case: forwarded to WhiteListFst and embedded in the cache file name.
        deterministic: forwarded to the parent constructor and to most grammars; when False,
            Roman-numeral and abbreviation branches are added as well.
        cache_dir: directory that holds the cached .far file. ``None`` or the literal
            string 'None' disables caching.
        overwrite_cache: defaults to True, so an existing cache file is only read when the
            caller passes False.
        whitelist: optional whitelist file path, passed to WhiteListFst as ``input_file``;
            its base name is part of the cache file name.
    """
    super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)
    far_file = None
    if cache_dir is not None and cache_dir != 'None':
        os.makedirs(cache_dir, exist_ok=True)
        whitelist_file = os.path.basename(whitelist) if whitelist else ""
        far_file = os.path.join(
            cache_dir, f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
        )
    if not overwrite_cache and far_file and os.path.exists(far_file):
        self.fst = pynini.Far(far_file, mode='r')['tokenize_and_classify']
        # Only self.fst is stored in the archive; the no-digits variant is re-derived here.
        no_digits = pynini.closure(pynini.difference(NEMO_CHAR, NEMO_DIGIT))
        self.fst_no_digits = pynini.compose(self.fst, no_digits).optimize()
        logging.info(f'ClassifyFst.fst was restored from {far_file}.')
    else:
        logging.info(f'Creating ClassifyFst grammars. This might take some time...')
        # TAGGERS
        cardinal = CardinalFst(deterministic=deterministic)
        cardinal_graph = cardinal.fst
        ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic)
        # A second, always-deterministic ordinal tagger is built for SerialFst below.
        deterministic_ordinal = OrdinalFst(cardinal=cardinal, deterministic=True)
        ordinal_graph = ordinal.fst
        decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic)
        decimal_graph = decimal.fst
        fraction = FractionFst(deterministic=deterministic, cardinal=cardinal)
        fraction_graph = fraction.fst
        measure = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, deterministic=deterministic)
        measure_graph = measure.fst
        date_graph = DateFst(cardinal=cardinal, deterministic=deterministic).fst
        # Word, whitelist and punctuation use `.graph` (not `.fst`) and are unioned
        # below without being composed with a verbalizer.
        word_graph = WordFst(deterministic=deterministic).graph
        time_graph = TimeFst(cardinal=cardinal, deterministic=deterministic).fst
        telephone_graph = TelephoneFst(deterministic=deterministic).fst
        electronic_graph = ElectronicFst(deterministic=deterministic).fst
        money_graph = MoneyFst(cardinal=cardinal, decimal=decimal, deterministic=deterministic).fst
        # `whitelist` is rebound from the file path argument to the WhiteListFst object;
        # that object is what AbbreviationFst receives further down.
        whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist)
        whitelist_graph = whitelist.graph
        punct_graph = PunctuationFst(deterministic=deterministic).graph
        serial_graph = SerialFst(cardinal=cardinal, ordinal=deterministic_ordinal, deterministic=deterministic).fst

        # VERBALIZERS
        # From here on `cardinal`, `decimal`, `ordinal`, `fraction` and `measure` are rebound
        # to verbalizer objects; the tagger graphs were captured in the *_graph locals above.
        cardinal = vCardinal(deterministic=deterministic)
        v_cardinal_graph = cardinal.fst
        decimal = vDecimal(cardinal=cardinal, deterministic=deterministic)
        v_decimal_graph = decimal.fst
        ordinal = vOrdinal(deterministic=deterministic)
        v_ordinal_graph = ordinal.fst
        fraction = vFraction(deterministic=deterministic)
        v_fraction_graph = fraction.fst
        v_telephone_graph = vTelephone(deterministic=deterministic).fst
        v_electronic_graph = vElectronic(deterministic=deterministic).fst
        measure = vMeasure(decimal=decimal, cardinal=cardinal, fraction=fraction, deterministic=deterministic)
        v_measure_graph = measure.fst
        v_time_graph = vTime(deterministic=deterministic).fst
        v_date_graph = vDate(ordinal=ordinal, deterministic=deterministic).fst
        v_money_graph = vMoney(decimal=decimal, deterministic=deterministic).fst
        v_roman_graph = vRoman(deterministic=deterministic).fst
        v_abbreviation = vAbbreviation(deterministic=deterministic).fst

        # RangeFst is fed time/date graphs composed with always-deterministic verbalizers
        # and a fresh deterministic CardinalFst, independent of the `deterministic` argument.
        det_v_time_graph = vTime(deterministic=True).fst
        det_v_date_graph = vDate(ordinal=vOrdinal(deterministic=True), deterministic=True).fst
        time_final = pynini.compose(time_graph, det_v_time_graph)
        date_final = pynini.compose(date_graph, det_v_date_graph)
        range_graph = RangeFst(time=time_final, date=date_final, cardinal=CardinalFst(deterministic=True), deterministic=deterministic).fst
        v_word_graph = vWord(deterministic=deterministic).fst

        # Branch weights: semiotic classes, plain words, punctuation.
        sem_w = 1
        word_w = 100
        punct_w = 2
        classify_and_verbalize = (
            pynutil.add_weight(whitelist_graph, sem_w)
            | pynutil.add_weight(pynini.compose(time_graph, v_time_graph), sem_w)
            | pynutil.add_weight(pynini.compose(decimal_graph, v_decimal_graph), sem_w)
            | pynutil.add_weight(pynini.compose(measure_graph, v_measure_graph), sem_w)
            | pynutil.add_weight(pynini.compose(cardinal_graph, v_cardinal_graph), sem_w)
            | pynutil.add_weight(pynini.compose(ordinal_graph, v_ordinal_graph), sem_w)
            | pynutil.add_weight(pynini.compose(telephone_graph, v_telephone_graph), sem_w)
            | pynutil.add_weight(pynini.compose(electronic_graph, v_electronic_graph), sem_w)
            | pynutil.add_weight(pynini.compose(fraction_graph, v_fraction_graph), sem_w)
            | pynutil.add_weight(pynini.compose(money_graph, v_money_graph), sem_w)
            | pynutil.add_weight(word_graph, word_w)
            # The date branch gets a slightly smaller weight than the other semiotic classes.
            | pynutil.add_weight(pynini.compose(date_graph, v_date_graph), sem_w - 0.01)
            | pynutil.add_weight(pynini.compose(range_graph, v_word_graph), sem_w)
            | pynutil.add_weight(
                pynini.compose(serial_graph, v_word_graph), 1.1001)  # should be higher than the rest of the classes
        ).optimize()

        if not deterministic:
            roman_graph = RomanFst(deterministic=deterministic).fst
            # the weight matches the word_graph weight for "I" cases in long sentences with multiple semiotic tokens
            classify_and_verbalize |= pynutil.add_weight(
                pynini.compose(roman_graph, v_roman_graph), word_w)

            # `whitelist` here is the WhiteListFst object created in the tagger section.
            abbreviation_graph = AbbreviationFst(
                whitelist=whitelist, deterministic=deterministic).fst
            classify_and_verbalize |= pynutil.add_weight(
                pynini.compose(abbreviation_graph, v_abbreviation), word_w)

        punct_only = pynutil.add_weight(punct_graph, weight=punct_w)
        # One or more of: a whitespace run passed through delete_extra_space, or an
        # inserted space followed by a punctuation mark.
        punct = pynini.closure(
            pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
            | (pynutil.insert(" ") + punct_only),
            1,
        )

        token_plus_punct = (pynini.closure(punct + pynutil.insert(" "))
                            + classify_and_verbalize
                            + pynini.closure(pynutil.insert(" ") + punct))

        # Tokens are separated either by whitespace or by space-padded punctuation.
        graph = token_plus_punct + pynini.closure(
            (pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)
             | (pynutil.insert(" ") + punct + pynutil.insert(" ")))
            + token_plus_punct)

        # Additional alternative: input that starts with a punctuation mark and has no token.
        graph |= punct_only + pynini.closure(punct)
        graph = delete_space + graph + delete_space

        # Output-side filter: runs of NEMO_NOT_SPACE separated via delete_extra_space,
        # optionally preceded by one or more deleted leading spaces.
        remove_extra_spaces = pynini.closure(
            NEMO_NOT_SPACE, 1) + pynini.closure(delete_extra_space + pynini.closure(NEMO_NOT_SPACE, 1))
        remove_extra_spaces |= (
            pynini.closure(pynutil.delete(" "), 1)
            + pynini.closure(NEMO_NOT_SPACE, 1)
            + pynini.closure(delete_extra_space + pynini.closure(NEMO_NOT_SPACE, 1)))

        graph = pynini.compose(graph.optimize(), remove_extra_spaces).optimize()
        self.fst = graph
        no_digits = pynini.closure(pynini.difference(
            NEMO_CHAR, NEMO_DIGIT))
        self.fst_no_digits = pynini.compose(graph, no_digits).optimize()

        if far_file:
            # Only self.fst is written to the archive.
            generator_main(far_file, {"tokenize_and_classify": self.fst})
            logging.info(f'ClassifyFst grammars are saved to {far_file}.')
""" Utils to get alternative formats for numbers. """ one_alternatives = load_labels( get_abs_path('data/numbers/cardinals_alternatives.tsv')) one_thousand_map = [] for k in one_alternatives: default, alternative = k one_thousand_map.append((alternative.split()[1], alternative)) one_thousand_map = pynini.string_map(one_thousand_map) one_thousand_alternative = pynini.cdrewrite(one_thousand_map, "[BOS]", "", NEMO_SIGMA) t = pynini.Far(get_abs_path('data/utils/universal_thousands_punct.far')) separators = (pynutil.add_weight(t['dot_thousands'], 0.1) | pynutil.add_weight(t['no_delimiter'], -0.1) | pynutil.add_weight(t['space_thousands'], 0.1)) alternative_formats = {} alternative_formats['one_thousand_alternative'] = one_thousand_alternative alternative_formats['separators'] = separators return alternative_formats if __name__ == '__main__': from nemo_text_processing.text_normalization.en.graph_utils import generator_main numbers = get_number_names() for k, v in numbers.items(): generator_main(f'{k}.far', {k: v})
def __init__(
    self,
    input_case: str,
    deterministic: bool = True,
    cache_dir: str = None,
    overwrite_cache: bool = False,
    whitelist: str = None,
):
    """
    Build the tokenize-and-classify graph, or restore it from a FAR cache.

    Args:
        input_case: forwarded to WhiteListFst and embedded in the cache file name.
        deterministic: forwarded to the parent constructor and to every grammar; when
            False, an abbreviation branch is added to the union.
        cache_dir: directory that holds the cached .far file. ``None`` or the literal
            string "None" disables caching.
        overwrite_cache: when True, an existing cache file is ignored and rewritten.
        whitelist: optional whitelist file path, passed to WhiteListFst as ``input_file``;
            its base name is part of the cache file name.
    """
    super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)

    far_file = None
    if cache_dir is not None and cache_dir != "None":
        os.makedirs(cache_dir, exist_ok=True)
        whitelist_file = os.path.basename(whitelist) if whitelist else ""
        far_name = f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
        far_file = os.path.join(cache_dir, far_name)

    # Fast path: reuse a previously exported grammar.
    if far_file and not overwrite_cache and os.path.exists(far_file):
        self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
        logging.info(f'ClassifyFst.fst was restored from {far_file}.')
        return

    logging.info("Creating ClassifyFst grammars.")

    cardinal = CardinalFst(deterministic=deterministic)
    cardinal_fst = cardinal.fst
    ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic)
    ordinal_fst = ordinal.fst
    decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic)
    decimal_fst = decimal.fst
    fraction = FractionFst(deterministic=deterministic, cardinal=cardinal)
    fraction_fst = fraction.fst
    measure = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, deterministic=deterministic)
    measure_fst = measure.fst
    date_fst = DateFst(cardinal=cardinal, deterministic=deterministic).fst
    word_fst = WordFst(deterministic=deterministic).fst
    time_fst = TimeFst(cardinal=cardinal, deterministic=deterministic).fst
    telephone_fst = TelephoneFst(deterministic=deterministic).fst
    electronic_fst = ElectronicFst(deterministic=deterministic).fst
    money_fst = MoneyFst(cardinal=cardinal, decimal=decimal, deterministic=deterministic).fst
    whitelist_fst = WhiteListFst(
        input_case=input_case, deterministic=deterministic, input_file=whitelist
    ).fst
    punct_fst = PunctuationFst(deterministic=deterministic).fst
    serial_fst = SerialFst(cardinal=cardinal, ordinal=ordinal, deterministic=deterministic).fst

    # The range grammar is built from time/date taggers composed with their verbalizers.
    time_verbalizer_fst = vTimeFst(deterministic=deterministic).fst
    ordinal_verbalizer = vOrdinalFst(deterministic=deterministic)
    date_verbalizer_fst = vDateFst(ordinal=ordinal_verbalizer, deterministic=deterministic).fst
    time_final = pynini.compose(time_fst, time_verbalizer_fst)
    date_final = pynini.compose(date_fst, date_verbalizer_fst)
    range_fst = RangeFst(
        time=time_final, date=date_final, cardinal=cardinal, deterministic=deterministic
    ).fst

    # (grammar, weight) pairs, unioned left to right in this order.
    weighted_branches = (
        (whitelist_fst, 1.01),
        (time_fst, 1.1),
        (date_fst, 1.09),
        (decimal_fst, 1.1),
        (measure_fst, 1.1),
        (cardinal_fst, 1.1),
        (ordinal_fst, 1.1),
        (money_fst, 1.1),
        (telephone_fst, 1.1),
        (electronic_fst, 1.1),
        (fraction_fst, 1.1),
        (range_fst, 1.1),
        (serial_fst, 1.1001),  # should be higher than the rest of the classes
    )
    classify = None
    for branch, weight in weighted_branches:
        weighted = pynutil.add_weight(branch, weight)
        classify = weighted if classify is None else classify | weighted

    # No RomanFst branch is added to this union.

    if not deterministic:
        abbreviation_fst = AbbreviationFst(deterministic=deterministic).fst
        classify |= pynutil.add_weight(abbreviation_fst, 100)

    def as_token(inner):
        # Wrap a grammar in the `tokens { ... }` markup.
        return pynutil.insert("tokens { ") + inner + pynutil.insert(" }")

    def whitespace_gap():
        # One or more whitespace characters passed through delete_extra_space.
        return pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)

    tagged_punct = as_token(pynutil.add_weight(punct_fst, weight=2.1))
    punct = pynini.closure(whitespace_gap() | (pynutil.insert(" ") + tagged_punct), 1)

    classify |= pynutil.add_weight(word_fst, 100)
    token = as_token(classify)

    leading_punct = pynini.closure(punct + pynutil.insert(" "))
    trailing_punct = pynini.closure(pynutil.insert(" ") + punct)
    token_plus_punct = leading_punct + token + trailing_punct

    separator = whitespace_gap() | (pynutil.insert(" ") + punct + pynutil.insert(" "))
    sentence = token_plus_punct + pynini.closure(separator + token_plus_punct)
    sentence = delete_space + sentence + delete_space
    # Also accept input made only of punctuation / whitespace.
    sentence |= punct

    self.fst = sentence.optimize()

    if far_file:
        generator_main(far_file, {"tokenize_and_classify": self.fst})
        logging.info(f"ClassifyFst grammars are saved to {far_file}.")
def __init__(
    self,
    input_case: str,
    deterministic: bool = True,
    cache_dir: str = None,
    overwrite_cache: bool = True,
    whitelist: str = None,
):
    """
    Build the combined classify-and-verbalize graph, or restore it from a FAR cache.

    Each semiotic tagger is composed with its verbalizer
    (``pynini.compose(tagger, verbalizer)``) before the weighted union is taken; the
    result is stored in ``self.fst``.

    Args:
        input_case: forwarded to WhiteListFst and embedded in the cache file name.
        deterministic: forwarded to the parent constructor and to every grammar; when False,
            Roman-numeral and abbreviation branches are added as well.
        cache_dir: directory that holds the cached .far file. ``None`` or the literal
            string 'None' disables caching.
        overwrite_cache: defaults to True, so an existing cache file is only read when the
            caller passes False.
        whitelist: optional whitelist file path, passed to WhiteListFst as ``input_file``;
            its base name is part of the cache file name.
    """
    super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)
    far_file = None
    if cache_dir is not None and cache_dir != 'None':
        os.makedirs(cache_dir, exist_ok=True)
        whitelist_file = os.path.basename(whitelist) if whitelist else ""
        far_file = os.path.join(
            cache_dir, f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
        )
    if not overwrite_cache and far_file and os.path.exists(far_file):
        self.fst = pynini.Far(far_file, mode='r')['tokenize_and_classify']
        logging.info(f'ClassifyFst.fst was restored from {far_file}.')
    else:
        logging.info(
            f'Creating ClassifyFst grammars. This might take some time...')
        # TAGGERS
        cardinal = CardinalFst(deterministic=deterministic)
        cardinal_graph = cardinal.fst
        ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic)
        ordinal_graph = ordinal.fst
        decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic)
        decimal_graph = decimal.fst
        fraction = FractionFst(deterministic=deterministic, cardinal=cardinal)
        fraction_graph = fraction.fst
        measure = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, deterministic=deterministic)
        measure_graph = measure.fst
        date_graph = DateFst(cardinal=cardinal, deterministic=deterministic).fst
        # Word, whitelist and punctuation use `.graph` (not `.fst`) and are unioned
        # below without being composed with a verbalizer.
        word_graph = WordFst(deterministic=deterministic).graph
        time_graph = TimeFst(cardinal=cardinal, deterministic=deterministic).fst
        telephone_graph = TelephoneFst(deterministic=deterministic).fst
        electronic_graph = ElectronicFst(deterministic=deterministic).fst
        money_graph = MoneyFst(cardinal=cardinal, decimal=decimal, deterministic=deterministic).fst
        # `whitelist` is rebound from the file path argument to the WhiteListFst object;
        # that object is what AbbreviationFst receives further down.
        whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist)
        whitelist_graph = whitelist.graph
        punct_graph = PunctuationFst(deterministic=deterministic).graph

        # VERBALIZERS
        # From here on `cardinal`, `decimal`, `ordinal`, `fraction` and `measure` are rebound
        # to verbalizer objects; the tagger graphs were captured in the *_graph locals above.
        cardinal = vCardinal(deterministic=deterministic)
        v_cardinal_graph = cardinal.fst
        decimal = vDecimal(cardinal=cardinal, deterministic=deterministic)
        v_decimal_graph = decimal.fst
        ordinal = vOrdinal(deterministic=deterministic)
        v_ordinal_graph = ordinal.fst
        fraction = vFraction(deterministic=deterministic)
        v_fraction_graph = fraction.fst
        v_telephone_graph = vTelephone(deterministic=deterministic).fst
        v_electronic_graph = vElectronic(deterministic=deterministic).fst
        measure = vMeasure(decimal=decimal, cardinal=cardinal, fraction=fraction, deterministic=deterministic)
        v_measure_graph = measure.fst
        v_time_graph = vTime(deterministic=deterministic).fst
        v_date_graph = vDate(ordinal=ordinal, deterministic=deterministic).fst
        v_money_graph = vMoney(decimal=decimal, deterministic=deterministic).fst
        v_roman_graph = vRoman(deterministic=deterministic).fst
        v_abbreviation = vAbbreviation(deterministic=deterministic).fst

        # Weighted union: whitelist 1.01, date 1.09, other semiotic classes 1.1, plain words 100.
        classify_and_verbalize = (
            pynutil.add_weight(whitelist_graph, 1.01)
            | pynutil.add_weight(pynini.compose(time_graph, v_time_graph), 1.1)
            | pynutil.add_weight(pynini.compose(decimal_graph, v_decimal_graph), 1.1)
            | pynutil.add_weight(pynini.compose(measure_graph, v_measure_graph), 1.1)
            | pynutil.add_weight(pynini.compose(cardinal_graph, v_cardinal_graph), 1.1)
            | pynutil.add_weight(pynini.compose(ordinal_graph, v_ordinal_graph), 1.1)
            | pynutil.add_weight(pynini.compose(telephone_graph, v_telephone_graph), 1.1)
            | pynutil.add_weight(pynini.compose(electronic_graph, v_electronic_graph), 1.1)
            | pynutil.add_weight(pynini.compose(fraction_graph, v_fraction_graph), 1.1)
            | pynutil.add_weight(pynini.compose(money_graph, v_money_graph), 1.1)
            | pynutil.add_weight(word_graph, 100)
            | pynutil.add_weight(pynini.compose(date_graph, v_date_graph), 1.09)).optimize()

        if not deterministic:
            roman_graph = RomanFst(deterministic=deterministic).fst
            # the weight matches the word_graph weight for "I" cases in long sentences with multiple semiotic tokens
            classify_and_verbalize |= pynutil.add_weight(
                pynini.compose(roman_graph, v_roman_graph), 100)

            # `whitelist` here is the WhiteListFst object created in the tagger section.
            abbreviation_graph = AbbreviationFst(
                whitelist=whitelist, deterministic=deterministic).fst
            classify_and_verbalize |= pynutil.add_weight(
                pynini.compose(abbreviation_graph, v_abbreviation), 100)

        # Punctuation is used as-is here (no `tokens { }` wrapper is inserted).
        punct = pynutil.add_weight(punct_graph, weight=1.1)
        token_plus_punct = (pynini.closure(punct + pynutil.insert(" "))
                            + classify_and_verbalize
                            + pynini.closure(pynutil.insert(" ") + punct))

        graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
        graph = delete_space + graph + delete_space

        self.fst = graph.optimize()
        if far_file:
            generator_main(far_file, {"tokenize_and_classify": self.fst})
            logging.info(f'ClassifyFst grammars are saved to {far_file}.')
def __init__(self, input_case: str, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False):
    """
    Build the tokenize-and-classify graph, or restore it from a FAR cache.

    Args:
        input_case: forwarded to WhiteListFst and embedded in the cache file name.
        deterministic: forwarded to the parent constructor and to every grammar; when
            False, Roman-numeral and abbreviation branches are added to the union.
        cache_dir: directory that holds the cached .far file. ``None`` or the literal
            string "None" disables caching.
        overwrite_cache: when True, an existing cache file is ignored and rewritten.
    """
    super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)

    far_file = None
    if cache_dir is not None and cache_dir != "None":
        os.makedirs(cache_dir, exist_ok=True)
        far_name = f"_{input_case}_en_tn_{deterministic}_deterministic.far"
        far_file = os.path.join(cache_dir, far_name)

    # Fast path: reuse a previously exported grammar.
    if far_file and not overwrite_cache and os.path.exists(far_file):
        self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
        logging.info(f'ClassifyFst.fst was restored from {far_file}.')
        return

    logging.info("Creating ClassifyFst grammars.")

    cardinal = CardinalFst(deterministic=deterministic)
    cardinal_fst = cardinal.fst
    ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic)
    ordinal_fst = ordinal.fst
    decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic)
    decimal_fst = decimal.fst
    fraction = FractionFst(deterministic=deterministic, cardinal=cardinal)
    fraction_fst = fraction.fst
    measure = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, deterministic=deterministic)
    measure_fst = measure.fst
    date_fst = DateFst(cardinal=cardinal, deterministic=deterministic).fst
    word_fst = WordFst(deterministic=deterministic).fst
    time_fst = TimeFst(cardinal=cardinal, deterministic=deterministic).fst
    telephone_fst = TelephoneFst(deterministic=deterministic).fst
    electronic_fst = ElectronicFst(deterministic=deterministic).fst
    money_fst = MoneyFst(cardinal=cardinal, decimal=decimal, deterministic=deterministic).fst
    whitelist_fst = WhiteListFst(input_case=input_case, deterministic=deterministic).fst
    punct_fst = PunctuationFst(deterministic=deterministic).fst

    # (grammar, weight) pairs, unioned left to right in this order.
    weighted_branches = (
        (whitelist_fst, 1.01),
        (time_fst, 1.1),
        (date_fst, 1.09),
        (decimal_fst, 1.1),
        (measure_fst, 1.1),
        (cardinal_fst, 1.1),
        (ordinal_fst, 1.1),
        (money_fst, 1.1),
        (telephone_fst, 1.1),
        (electronic_fst, 1.1),
        (fraction_fst, 1.1),
        (word_fst, 100),
    )
    classify = None
    for branch, weight in weighted_branches:
        weighted = pynutil.add_weight(branch, weight)
        classify = weighted if classify is None else classify | weighted

    if not deterministic:
        roman_fst = RomanFst(deterministic=deterministic).fst
        # the weight matches the word_graph weight for "I" cases in long sentences with multiple semiotic tokens
        classify |= pynutil.add_weight(roman_fst, 100)

        abbreviation_fst = AbbreviationFst(deterministic=deterministic).fst
        classify |= pynutil.add_weight(abbreviation_fst, 100)

    def as_token(inner):
        # Wrap a grammar in the `tokens { ... }` markup.
        return pynutil.insert("tokens { ") + inner + pynutil.insert(" }")

    punct = as_token(pynutil.add_weight(punct_fst, weight=1.1))
    token = as_token(classify)

    leading_punct = pynini.closure(punct + pynutil.insert(" "))
    trailing_punct = pynini.closure(pynutil.insert(" ") + punct)
    token_plus_punct = leading_punct + token + trailing_punct

    sentence = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
    sentence = delete_space + sentence + delete_space
    self.fst = sentence.optimize()

    if far_file:
        generator_main(far_file, {"tokenize_and_classify": self.fst})
        logging.info(f"ClassifyFst grammars are saved to {far_file}.")
def __init__(self, input_case: str, deterministic: bool = False, cache_dir: str = None, overwrite_cache: bool = False):
    """
    Build the tokenize-and-classify graph, or restore it from a FAR cache.

    When the graph is built (not restored), the individual taggers are kept on the
    instance as ``self.cardinal``, ``self.ordinal``, ``self.decimal``, ``self.measure``,
    ``self.date``, ``self.time``, ``self.telephone``, ``self.electronic``, ``self.money``
    and ``self.whitelist``.

    Args:
        input_case: forwarded to WhiteListFst and embedded in the cache file name.
        deterministic: must be False; forwarded to the parent constructor and every grammar.
        cache_dir: directory that holds the cached .far file. ``None`` or the literal
            string "None" disables caching.
        overwrite_cache: when True, an existing cache file is ignored and rewritten.

    Raises:
        ValueError: if ``deterministic`` is truthy (raised after the parent constructor ran).
    """
    super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)
    if deterministic:
        raise ValueError(
            'Ru TN only supports non-deterministic cases and produces multiple normalization options.'
        )

    far_file = None
    if cache_dir is not None and cache_dir != "None":
        os.makedirs(cache_dir, exist_ok=True)
        far_name = f"_{input_case}_ru_tn_{deterministic}_deterministic.far"
        far_file = os.path.join(cache_dir, far_name)

    # Fast path: reuse a previously exported grammar.
    if far_file and not overwrite_cache and os.path.exists(far_file):
        self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
        logging.info(f"ClassifyFst.fst was restored from {far_file}.")
        return

    logging.info("Creating ClassifyFst grammars. This might take some time...")

    # Shared resources handed to several of the taggers below.
    number_names = get_number_names()
    alternative_formats = get_alternative_formats()

    self.cardinal = CardinalFst(
        number_names=number_names, alternative_formats=alternative_formats, deterministic=deterministic
    )
    cardinal_fst = self.cardinal.fst

    self.ordinal = OrdinalFst(
        number_names=number_names, alternative_formats=alternative_formats, deterministic=deterministic
    )
    ordinal_fst = self.ordinal.fst

    self.decimal = DecimalFst(cardinal=self.cardinal, deterministic=deterministic)
    decimal_fst = self.decimal.fst

    self.measure = MeasureFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic)
    measure_fst = self.measure.fst

    self.date = DateFst(number_names=number_names, deterministic=deterministic)
    date_fst = self.date.fst

    word_fst = WordFst(deterministic=deterministic).fst

    self.time = TimeFst(number_names=number_names, deterministic=deterministic)
    time_fst = self.time.fst

    self.telephone = TelephoneFst(number_names=number_names, deterministic=deterministic)
    telephone_fst = self.telephone.fst

    self.electronic = ElectronicFst(deterministic=deterministic)
    electronic_fst = self.electronic.fst

    self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic)
    money_fst = self.money.fst

    self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic)
    whitelist_fst = self.whitelist.fst

    punct_fst = PunctuationFst(deterministic=deterministic).fst

    # (grammar, weight) pairs, unioned left to right in this order.
    weighted_branches = (
        (whitelist_fst, 1.01),
        (time_fst, 1.1),
        (date_fst, 1.09),
        (decimal_fst, 1.1),
        (measure_fst, 0.9),
        (cardinal_fst, 1.1),
        (ordinal_fst, 1.1),
        (money_fst, 1.1),
        (telephone_fst, 1.1),
        (electronic_fst, 1.1),
        (word_fst, 100),
    )
    classify = None
    for branch, weight in weighted_branches:
        weighted = pynutil.add_weight(branch, weight)
        classify = weighted if classify is None else classify | weighted

    def as_token(inner):
        # Wrap a grammar in the `tokens { ... }` markup.
        return pynutil.insert("tokens { ") + inner + pynutil.insert(" }")

    punct = as_token(pynutil.add_weight(punct_fst, weight=1.1))
    token = as_token(classify)

    leading_punct = pynini.closure(punct + pynutil.insert(" "))
    trailing_punct = pynini.closure(pynutil.insert(" ") + punct)
    token_plus_punct = leading_punct + token + trailing_punct

    weighted_separator = pynutil.add_weight(delete_extra_space, 1.1)
    sentence = token_plus_punct + pynini.closure(weighted_separator + token_plus_punct)
    sentence = delete_space + sentence + delete_space
    self.fst = sentence.optimize()

    if far_file:
        generator_main(far_file, {"tokenize_and_classify": self.fst})
        logging.info(f"ClassifyFst grammars are saved to {far_file}.")
def __init__(self,
             cache_dir: str = None,
             overwrite_cache: bool = False,
             deterministic: bool = True):
    """Create the German ITN tokenize-and-classify graph, or load it from cache.

    Args:
        cache_dir: directory holding the compiled ``_de_itn.far`` file.
            ``None`` or the string 'None' disables caching.
        overwrite_cache: when True, an existing .far file is ignored and the
            grammar is rebuilt (and re-saved if caching is enabled).
        deterministic: forwarded to the parent constructor; the TN helper
            grammars below are always created with ``deterministic=False``.
    """
    super().__init__(name="tokenize_and_classify",
                     kind="classify",
                     deterministic=deterministic)

    # Resolve the cache location; far_file stays None when caching is off.
    far_file = None
    if cache_dir is not None and cache_dir != 'None':
        os.makedirs(cache_dir, exist_ok=True)
        far_file = os.path.join(cache_dir, "_de_itn.far")

    # Cache hit: load the compiled graph and skip grammar construction.
    if far_file and not overwrite_cache and os.path.exists(far_file):
        self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
        logging.info(f"ClassifyFst.fst was restored from {far_file}.")
        return

    logging.info("Creating ClassifyFst grammars.")

    # TN taggers/verbalizers that the ITN grammars are constructed from.
    tn_cardinal_tagger = TNCardinalTagger(deterministic=False)
    tn_date_tagger = TNDateTagger(cardinal=tn_cardinal_tagger,
                                  deterministic=False)
    tn_decimal_tagger = TNDecimalTagger(cardinal=tn_cardinal_tagger,
                                        deterministic=False)
    tn_ordinal_verbalizer = TNOrdinalVerbalizer(deterministic=False)
    tn_fraction_verbalizer = TNFractionVerbalizer(
        ordinal=tn_ordinal_verbalizer, deterministic=False)
    tn_time_verbalizer = TNTimeVerbalizer(
        cardinal_tagger=tn_cardinal_tagger, deterministic=False)
    tn_date_verbalizer = TNDateVerbalizer(ordinal=tn_ordinal_verbalizer,
                                          deterministic=False)
    tn_electronic_tagger = TNElectronicTagger(deterministic=False)
    tn_electronic_verbalizer = TNElectronicVerbalizer(deterministic=False)
    tn_whitelist_tagger = TNWhitelistTagger(input_case="cased",
                                            deterministic=False)

    # ITN grammars; cardinal/decimal/fraction objects are reused by others.
    cardinal = CardinalFst(tn_cardinal_tagger=tn_cardinal_tagger)
    ordinal = OrdinalFst(itn_cardinal_tagger=cardinal,
                         tn_ordinal_verbalizer=tn_ordinal_verbalizer)
    decimal = DecimalFst(itn_cardinal_tagger=cardinal,
                         tn_decimal_tagger=tn_decimal_tagger)
    fraction = FractionFst(itn_cardinal_tagger=cardinal,
                           tn_fraction_verbalizer=tn_fraction_verbalizer)
    measure_fst = MeasureFst(itn_cardinal_tagger=cardinal,
                             itn_decimal_tagger=decimal,
                             itn_fraction_tagger=fraction).fst
    date_fst = DateFst(itn_cardinal_tagger=cardinal,
                       tn_date_verbalizer=tn_date_verbalizer,
                       tn_date_tagger=tn_date_tagger).fst
    word_fst = WordFst().fst
    time_fst = TimeFst(tn_time_verbalizer=tn_time_verbalizer).fst
    money_fst = MoneyFst(itn_cardinal_tagger=cardinal,
                         itn_decimal_tagger=decimal).fst
    whitelist_fst = WhiteListFst(
        tn_whitelist_tagger=tn_whitelist_tagger).fst
    punct_fst = PunctuationFst().fst
    electronic_fst = ElectronicFst(
        tn_electronic_tagger=tn_electronic_tagger,
        tn_electronic_verbalizer=tn_electronic_verbalizer).fst
    telephone_fst = TelephoneFst(
        tn_cardinal_tagger=tn_cardinal_tagger).fst

    # (graph, weight) pairs, unioned left to right in this order.
    # NOTE(review): the weight of 100 on the word graph presumably makes it
    # the last-resort fallback -- confirm against the semiring in use.
    weighted_graphs = (
        (cardinal.fst, 1.1),
        (whitelist_fst, 1.0),
        (time_fst, 1.1),
        (date_fst, 1.1),
        (decimal.fst, 1.1),
        (measure_fst, 1.1),
        (ordinal.fst, 1.1),
        (fraction.fst, 1.1),
        (money_fst, 1.1),
        (telephone_fst, 1.1),
        (electronic_fst, 1.1),
        (word_fst, 100),
    )
    classify = None
    for sub_fst, weight in weighted_graphs:
        weighted = pynutil.add_weight(sub_fst, weight)
        classify = weighted if classify is None else classify | weighted

    # Wrap punctuation marks and classified tokens in `tokens { ... }`.
    punct = (pynutil.insert("tokens { ") +
             pynutil.add_weight(punct_fst, weight=1.1) +
             pynutil.insert(" }"))
    token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")

    # One token, optionally preceded and followed by punctuation tokens.
    leading_punct = pynini.closure(punct + pynutil.insert(" "))
    trailing_punct = pynini.closure(pynutil.insert(" ") + punct)
    token_plus_punct = leading_punct + token + trailing_punct

    # Any number of further tokens, each after a space separator.
    more_tokens = pynini.closure(delete_extra_space + token_plus_punct)
    sentence = token_plus_punct + more_tokens
    sentence = delete_space + sentence + delete_space

    self.fst = sentence.optimize()

    # Persist the freshly built graph when caching is enabled.
    if far_file:
        generator_main(far_file, {"tokenize_and_classify": self.fst})
        logging.info(f"ClassifyFst grammars are saved to {far_file}.")