def get_hundreds_graph(deterministic: bool = True):
    """
    Builds a transducer for four-digit numbers read as two groups
    (the "hundred" style instead of the "thousand" style), e.g.
        1219 -> twelve nineteen
        3900 -> thirty nine hundred

    Args:
        deterministic: forwarded to get_ties_graph() when building the two-digit component.

    Returns:
        Union of all supported readings.
    """
    graph_ties = get_ties_graph(deterministic)

    # two two-digit groups separated by a space
    pair_of_ties = graph_ties + insert_space + graph_ties

    # <teen> + "00" -> "<teen> hundred"
    teen_hundred = graph_teen + insert_space + pynini.cross("00", "hundred")

    # <teen> + tens digit + "0s"; the rewrite pluralizes the end of the output
    # ("y" -> "ies", otherwise an "s" is appended)
    teen_decade = graph_teen + insert_space + (ties_graph | pynini.cross("1", "ten")) + pynutil.delete("0s")
    pluralize_end = pynini.cdrewrite(pynini.cross("y", "ies") | pynutil.insert("s"), "", "[EOS]", NEMO_SIGMA)
    teen_decade_plural = teen_decade @ pluralize_end

    # <digit> + "00" read as "thousand", followed by a dropped "0" or by one more spoken digit
    digit_thousand = pynutil.add_weight(
        graph_digit
        + insert_space
        + pynini.cross("00", "thousand")
        + (pynutil.delete("0") | insert_space + graph_digit),
        weight=-0.001,
    )

    # <digit> + "000" + optional space + "s" -> "<digit> thousands"
    digit_thousands_plural = pynutil.add_weight(
        graph_digit
        + insert_space
        + pynini.cross("000", "thousand")
        + pynini.closure(pynutil.delete(" "), 0, 1)
        + pynini.accep("s"),
        weight=-0.001,
    )

    return pair_of_ties | teen_hundred | teen_decade_plural | digit_thousand | digit_thousands_plural
def __init__(self, cardinal: GraphFst, decimal: GraphFst):
    """
    Builds the "measure" classifier: a spoken cardinal or decimal number
    followed by a spoken unit is tagged as a `cardinal { ... }` or
    `decimal { ... }` group plus a `units: "..."` field.

    Args:
        cardinal: cardinal grammar; its `graph_no_exception` supplies the integer.
        decimal: decimal grammar; its `final_graph_wo_negative` supplies the decimal body.
    """
    super().__init__(name="measure", kind="classify")
    # decimal, fraction, cardinal, units, style(depr)
    number_graph = cardinal.graph_no_exception

    unit_table = pynini.string_file(get_abs_path("data/measurements.tsv"))
    singular_to_abbr = pynini.invert(unit_table)  # singular -> abbr
    plural_to_abbr = get_singulars(singular_to_abbr)  # plural -> abbr

    optional_negative = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("minus", "\"true\"") + delete_extra_space, 0, 1
    )

    # "per <unit>" -> "/<abbr>"
    per_unit = pynutil.insert("/") + pynutil.delete("per") + delete_space + convert_space(singular_to_abbr)

    def _units_field(unit):
        # a plain unit, a bare "per <unit>", or (slightly penalized) "<unit> per <unit>"
        return (
            pynutil.insert("units: \"")
            + (unit | per_unit | pynutil.add_weight(unit + delete_space + per_unit, 0.01))
            + pynutil.insert("\"")
        )

    def _cardinal_measure(integer, units_field):
        # cardinal { [negative] integer: "<integer>" } <units field>
        return (
            pynutil.insert("cardinal { ")
            + optional_negative
            + pynutil.insert("integer: \"")
            + integer
            + pynutil.insert("\"")
            + pynutil.insert(" }")
            + delete_extra_space
            + units_field
        )

    units_singular = _units_field(convert_space(singular_to_abbr))
    units_plural = _units_field(convert_space(plural_to_abbr))

    subgraph_decimal = (
        pynutil.insert("decimal { ")
        + optional_negative
        + decimal.final_graph_wo_negative
        + pynutil.insert(" }")
        + delete_extra_space
        + units_plural
    )

    # every number except "one" takes a plural unit; "one" takes a singular unit
    cardinal_plural = _cardinal_measure((NEMO_SIGMA - "one") @ number_graph, units_plural)
    cardinal_singular = _cardinal_measure(pynini.cross("one", "1"), units_singular)
    subgraph_cardinal = cardinal_plural | cardinal_singular

    tagged = self.add_tokens(subgraph_decimal | subgraph_cardinal)
    self.fst = tagged.optimize()
def __init__(self, deterministic: bool = True):
    """
    Builds the "fraction" verbalizer, which turns the serialized fields
    `integer: "..." numerator: "..." denominator: "..."` into spoken text.

    Args:
        deterministic: when False, the "and " inserted after the integer part
            becomes optional.
    """
    super().__init__(name="fraction", kind="verbalize", deterministic=deterministic)
    suffix = OrdinalFst().suffix

    integer = pynutil.delete("integer: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\" ")
    numerator = pynutil.delete("numerator: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\" ")
    numerator_one = pynutil.delete("numerator: \"") + pynini.accep("one") + pynutil.delete("\" ")
    # the denominator is read through the ordinal suffix rewrite, or "four" -> "quarter"
    denominator = pynutil.delete("denominator: \"") + (
        pynini.closure(NEMO_NOT_QUOTE) @ suffix | pynini.cross('four', 'quarter')
    )

    conjunction = pynutil.insert("and ")
    if not deterministic:
        conjunction = pynini.closure(conjunction, 0, 1)

    # optional "<integer> and " prefix
    integer = pynini.closure(integer + insert_space + conjunction, 0, 1)

    denominator_half = pynini.cross("numerator: \"one\" denominator: \"two\"", "a half")
    denominator_one_two = pynini.cross("denominator: \"one\"", "over one") | pynini.cross(
        "denominator: \"two\"", "halves"
    )
    # general case: the denominator gets a plural "s"
    fraction_default = pynutil.add_weight(
        numerator + insert_space + denominator + pynutil.insert("s") + pynutil.delete("\""), 0.001
    )
    # numerator "one": singular denominator
    fraction_with_one = pynutil.add_weight(
        numerator_one + insert_space + denominator + pynutil.delete("\""), 0.0001
    )

    # FIX: the original `integer + denominator_half | (fraction_with_one | fraction_default)`
    # parsed as `(integer + denominator_half) | (...)` because `+` binds tighter than `|`,
    # so the optional integer prefix was accepted only before "a half".
    # The prefix is optional, so every input accepted before is still accepted.
    graph = integer + (denominator_half | fraction_with_one | fraction_default)
    graph |= pynini.cross("numerator: \"one\" denominator: \"two\"", "one half")
    graph |= (numerator | numerator_one) + insert_space + denominator_one_two

    self.graph = graph
    delete_tokens = self.delete_tokens(self.graph)
    self.fst = delete_tokens.optimize()
def __init__(self, ordinal: GraphFst, cardinal: GraphFst):
    """
    Builds the "date" classifier for "<day> <month> [<year>]" and for a
    standalone year, optionally followed by "er" / "ern".

    Args:
        ordinal: ordinal grammar; its `graph` is used for the day.
        cardinal: cardinal grammar, stored on `self.cardinal`.
    """
    super().__init__(name="date", kind="classify")

    # stored before the year graph is built
    # NOTE(review): presumably read by self._get_year_graph() — confirm
    self.cardinal = cardinal
    day_ordinals = ordinal.graph

    YEAR_WEIGHT = 0.001
    weighted_year = pynutil.add_weight(self._get_year_graph(), YEAR_WEIGHT)

    month_field = pynutil.insert("month: \"") + _get_month_graph() + pynutil.insert("\"")
    day_field = pynutil.insert("day: \"") + pynutil.add_weight(day_ordinals, -0.7) + pynutil.insert("\"")

    # inside a full date the year penalty is cancelled again (+YEAR_WEIGHT - YEAR_WEIGHT)
    optional_year_field = pynini.closure(
        delete_extra_space
        + pynutil.insert("year: \"")
        + pynutil.add_weight(weighted_year, -YEAR_WEIGHT)
        + pynutil.insert("\""),
        0,
        1,
    )

    graph_dmy = day_field + delete_extra_space + month_field + optional_year_field

    # standalone year; an optional "er" or "ern" ending is kept in the year value
    optional_er_ending = pynini.closure(pynini.accep('er') + pynini.closure(pynini.accep('n'), 0, 1), 0, 1)
    graph_year = pynutil.insert("year: \"") + weighted_year + optional_er_ending + pynutil.insert("\"")

    final_graph = graph_dmy | graph_year
    final_graph += pynutil.insert(" preserve_order: true")
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def add_optional_and(self, graph):
    """
    Returns `graph` extended with readings that insert "and" after
    "hundred " or "thousand ".

    Args:
        graph: transducer whose output side contains spoken numbers.

    Returns:
        Union of the "and"-enriched graph and the original graph
        (the original carries a small extra weight of 0.00001).
    """
    if not self.deterministic:
        # additionally allow one "hundred " in the output to be replaced by a single space
        drop_hundred = NEMO_SIGMA + pynini.closure(pynini.cross("hundred ", " "), 0, 1) + NEMO_SIGMA
        graph = pynini.compose(graph, drop_hundred)

    any_unquoted = pynini.closure(NEMO_NOT_QUOTE)

    # "hundred " -> "hundred and ", only when no "thousand"/"million" follows
    tail_without_thousand_million = pynini.difference(
        any_unquoted, any_unquoted + pynini.union("thousand", "million") + any_unquoted
    ).optimize()
    hundred_and = (
        any_unquoted
        + pynutil.add_weight(
            pynini.cross("hundred ", "hundred and ") + tail_without_thousand_million, -0.0001
        )
    ).optimize()

    # "thousand " -> "thousand and ", only when no "hundred" follows
    tail_without_hundred = pynini.difference(
        NEMO_SIGMA, any_unquoted + pynini.accep("hundred") + any_unquoted
    ).optimize()
    thousand_and = (
        any_unquoted
        + pynutil.add_weight(pynini.cross("thousand ", "thousand and ") + tail_without_hundred, -0.0001)
    ).optimize()

    insert_and = hundred_and | thousand_and

    return pynini.compose(graph, insert_and).optimize() | pynutil.add_weight(graph, 0.00001)
def roman_to_int(fst: 'pynini.FstLike') -> 'pynini.FstLike':
    """
    Composes a Roman-numeral-to-Arabic-digit transducer onto `fst`, so that
    `fst` can be applied to Roman numerals written in lower or upper case.
    Valid for values up to 1000, e.g.
        "V" -> "5"
        "i" -> "1"

    Args:
        fst: Any fst. It is applied to the Arabic output of the Roman conversion.

    Returns:
        The Roman conversion graph composed with `fst`.
    """

    def _load_roman(file: str):
        # every entry of the table is accepted as listed and fully upper-cased
        pairs = load_labels(get_abs_path(file))
        as_listed = [(numeral, value) for numeral, value in pairs]
        upper_cased = [(numeral.upper(), value) for numeral, value in pairs]
        return pynini.string_map(as_listed + upper_cased)

    def _or_zero(position):
        # a missing position is filled with an inserted "0" at a small cost
        return position | pynutil.add_weight(pynutil.insert("0"), 0.01)

    digit = _load_roman("data/roman/digit.tsv")
    ties = _load_roman("data/roman/ties.tsv")
    hundreds = _load_roman("data/roman/hundreds.tsv")

    one_position = digit
    two_positions = ties + _or_zero(digit)
    three_positions = hundreds + _or_zero(ties) + _or_zero(digit)

    roman = (one_position | two_positions | three_positions).optimize()
    return roman @ fst
def __init__(self, ordinal: GraphFst):
    """
    Builds the "date" classifier for three shapes:
        <month> [<day>] [<year>]
        the <day> of <month> [<year>]
        <year> (or a year range from _get_range_graph())

    Args:
        ordinal: ordinal grammar; its `graph` is used for the day.
    """
    super().__init__(name="date", kind="classify")

    day_ordinals = ordinal.graph

    YEAR_WEIGHT = 0.001
    weighted_year = pynutil.add_weight(_get_year_graph(), YEAR_WEIGHT)

    month_field = pynutil.insert("month: \"") + _get_month_graph() + pynutil.insert("\"")
    day_field = pynutil.insert("day: \"") + pynutil.add_weight(day_ordinals, -0.7) + pynutil.insert("\"")
    optional_day_field = pynini.closure(delete_extra_space + day_field, 0, 1)

    # inside a full date the year penalty is cancelled again (+YEAR_WEIGHT - YEAR_WEIGHT)
    optional_year_field = pynini.closure(
        delete_extra_space
        + pynutil.insert("year: \"")
        + pynutil.add_weight(weighted_year, -YEAR_WEIGHT)
        + pynutil.insert("\""),
        0,
        1,
    )

    graph_mdy = month_field + optional_day_field + optional_year_field

    # "the <day> of <month> [<year>]"
    graph_dmy = (
        pynutil.delete("the")
        + delete_space
        + day_field
        + delete_space
        + pynutil.delete("of")
        + delete_extra_space
        + month_field
        + optional_year_field
    )

    graph_year = pynutil.insert("year: \"") + (weighted_year | _get_range_graph()) + pynutil.insert("\"")

    final_graph = graph_mdy | graph_dmy | graph_year
    final_graph += pynutil.insert(" preserve_order: true")
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def get_alternative_formats():
    """
    Utils to get alternative formats for numbers.

    Returns:
        dict with two optimized FSTs:
            'one_thousand_alternative': rewrite applied at the beginning of the string,
                mapping the second word of each alternative to the full alternative
            'separators': weighted union of the thousands-delimiter conventions
    """
    alternatives = load_labels(get_abs_path('data/numbers/cardinals_alternatives.tsv'))
    # each row is (default, alternative); the rewrite is keyed on the alternative's second word
    second_word_to_alternative = pynini.string_map(
        [(alternative.split()[1], alternative) for _, alternative in alternatives]
    )
    one_thousand_alternative = pynini.cdrewrite(second_word_to_alternative, "[BOS]", "", NEMO_SIGMA)

    # Adapted from
    # https://github.com/google/TextNormalizationCoveringGrammars/blob/master/src/universal/thousands_punct.grm
    # Specifies common ways of delimiting thousands in digit strings.
    thousands_far = pynini.Far(get_abs_path('data/utils/universal_thousands_punct.far'))
    separators = (
        pynutil.add_weight(thousands_far['dot_thousands'], 0.1)
        | pynutil.add_weight(thousands_far['no_delimiter'], -0.1)
        | pynutil.add_weight(thousands_far['space_thousands'], 0.1)
    )

    return {
        'one_thousand_alternative': one_thousand_alternative.optimize(),
        'separators': separators.optimize(),
    }
def __init__(self, whitelist: 'pynini.FstLike', deterministic: bool = True):
    """
    Builds the "abbreviation" classifier, which spells out letter sequences
    by inserting spaces between the letters and tags the result as `value: "..."`.

    Args:
        whitelist: object whose `.graph` attribute is read; inputs accepted by
            that graph are excluded here.
            NOTE(review): annotated as an FST although `.graph` is accessed — confirm the expected type.
        deterministic: forwarded to the base class.
    """
    super().__init__(name="abbreviation", kind="classify", deterministic=deterministic)

    # two or more upper-case letters, e.g. "ABC" -> "A B C"
    upper_run = NEMO_UPPER + pynini.closure(insert_space + NEMO_UPPER, 1)

    # TO_LOWER followed by any number of TO_LOWER / lower-case letters (heavily penalized)
    lowered_run = pynutil.add_weight(
        TO_LOWER + pynini.closure(insert_space + pynini.union(TO_LOWER | NEMO_LOWER)), 110
    )
    # two or more upper-case letters followed by lower-case letters (heavily penalized)
    upper_then_lower = pynutil.add_weight(
        pynini.closure(NEMO_UPPER, 2) + pynini.closure(insert_space + NEMO_LOWER, 1), 110
    )
    # dotted upper-case letters, e.g. "A.B." -> "A B"
    dotted_upper = NEMO_UPPER + pynutil.delete(".") + pynini.closure(insert_space + NEMO_UPPER + pynutil.delete("."))
    # dotted letters passed through TO_LOWER (heavily penalized)
    dotted_lowered = pynutil.add_weight(
        TO_LOWER + pynutil.delete(".") + pynini.closure(insert_space + TO_LOWER + pynutil.delete(".")), 110
    )

    misc_graph = lowered_run | upper_then_lower | dotted_upper | dotted_lowered

    # set the weight of the misc graph to a value higher than the word weight
    graph = pynutil.add_weight(upper_run.optimize(), 10) | pynutil.add_weight(misc_graph.optimize(), 101)

    # exclude words that are included in the whitelist
    allowed_inputs = pynini.difference(pynini.project(graph, "input"), pynini.project(whitelist.graph, "input"))
    graph = pynini.compose(allowed_inputs, graph)

    graph = pynutil.insert("value: \"") + graph.optimize() + pynutil.insert("\"")
    graph = self.add_tokens(graph)
    self.fst = graph.optimize()
def __init__(
    self,
    input_case: str,
    cache_dir: str = None,
    overwrite_cache: bool = False,
    deterministic: bool = True,
    whitelist: str = None,
):
    """
    Builds (or restores from a cached .far file) the top-level
    "tokenize_and_classify" grammar made of whitelist, word and punctuation graphs.

    Args:
        input_case: forwarded to WhiteListFst and used in the cache file name.
        cache_dir: directory for the .far cache; None or the string "None" disables caching.
        overwrite_cache: when True, an existing cache file is ignored and rewritten.
        deterministic: forwarded to the sub-grammars and used in the cache file name.
        whitelist: path whose base name becomes part of the cache file name.
            NOTE(review): the path is not passed to WhiteListFst here — confirm this is intended.
    """
    super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)

    far_file = None
    if cache_dir is not None and cache_dir != "None":
        os.makedirs(cache_dir, exist_ok=True)
        whitelist_file = os.path.basename(whitelist) if whitelist else ""
        far_file = os.path.join(
            cache_dir, f"_{input_case}_en_tn_{deterministic}_deterministic{whitelist_file}.far"
        )

    # cache hit: restore the compiled grammar and stop
    if far_file and not overwrite_cache and os.path.exists(far_file):
        self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
        logging.info(f'ClassifyFst.fst was restored from {far_file}.')
        return

    logging.info(f"Creating ClassifyFst grammars.")

    word_graph = WordFst(deterministic=deterministic).fst
    whitelist_graph = WhiteListFst(input_case=input_case, deterministic=deterministic).fst
    punct_graph = PunctuationFst(deterministic=deterministic).fst

    classify = pynutil.add_weight(whitelist_graph, 1) | pynutil.add_weight(word_graph, 100)

    # one or more white-space characters, passed through delete_extra_space
    squeeze_spaces = pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space)

    punct_token = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }")
    # a non-empty run of punctuation tokens and/or white space
    punct = pynini.closure(squeeze_spaces | (pynutil.insert(" ") + punct_token), 1)

    token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
    token_plus_punct = (
        pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct)
    )

    # further tokens are separated by white space or by punctuation
    separator = squeeze_spaces | (pynutil.insert(" ") + punct + pynutil.insert(" "))
    graph = token_plus_punct + pynini.closure(separator + token_plus_punct).optimize()

    graph = delete_space + graph + delete_space
    # input consisting only of punctuation / white space
    graph |= punct

    self.fst = graph.optimize()

    if far_file:
        generator_main(far_file, {"tokenize_and_classify": self.fst})
        logging.info(f"ClassifyFst grammars are saved to {far_file}.")
def __init__(self):
    """
    Builds the "money" classifier: a spoken amount followed by a spoken
    currency, optionally followed by cents, is tagged with `integer_part`,
    `currency` and `fractional_part` fields.
    """
    super().__init__(name="money", kind="classify")
    # quantity, integer_part, fractional_part, currency, style(depr)

    number_graph = CardinalFst().graph_no_exception
    decimal_graph = DecimalFst().final_graph_wo_negative

    currency_table = pynini.string_file(get_abs_path("data/currency.tsv"))
    currency_singular = pynini.invert(currency_table)
    currency_plural = get_singulars(currency_singular)

    def _currency_field(currency):
        return pynutil.insert("currency: \"") + convert_space(currency) + pynutil.insert("\"")

    currency_field_singular = _currency_field(currency_singular)
    currency_field_plural = _currency_field(currency_plural)

    # accept two digits as they are, left-pad a single digit with "0"
    two_digits = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT)

    # every number word except "one"
    not_one = (NEMO_SIGMA - "one") @ number_graph

    # twelve dollars (and) fifty cents, zero cents
    several_cents = (
        pynutil.add_weight(not_one, -0.7) @ two_digits + delete_space + pynutil.delete("cents")
    )
    single_cent = pynini.cross("one", "01") + delete_space + pynutil.delete("cent")
    cents_standalone = (
        pynutil.insert("fractional_part: \"") + pynini.union(several_cents, single_cent) + pynutil.insert("\"")
    )

    optional_cents_standalone = pynini.closure(
        delete_space
        + pynini.closure(pynutil.delete("and") + delete_space, 0, 1)
        + insert_space
        + cents_standalone,
        0,
        1,
    )
    # twelve dollars fifty, only after integer
    optional_cents_suffix = pynini.closure(
        delete_extra_space
        + pynutil.insert("fractional_part: \"")
        + pynutil.add_weight(number_graph, -0.7)
        + pynutil.insert("\""),
        0,
        1,
    )
    optional_cents = optional_cents_standalone | optional_cents_suffix

    # any amount except "one" takes the plural currency name
    integer_plural = (
        pynutil.insert("integer_part: \"")
        + not_one
        + pynutil.insert("\"")
        + delete_extra_space
        + currency_field_plural
        + optional_cents
    )
    # "one" takes the singular currency name
    integer_singular = (
        pynutil.insert("integer_part: \"")
        + pynini.cross("one", "1")
        + pynutil.insert("\"")
        + delete_extra_space
        + currency_field_singular
        + optional_cents
    )
    graph_integer = integer_plural | integer_singular

    decimal_amount = decimal_graph + delete_extra_space + currency_field_plural
    # cents on their own are tagged as zero dollars
    cents_only = pynutil.insert("currency: \"$\" integer_part: \"0\" ") + cents_standalone
    graph_decimal = decimal_amount | cents_only

    final_graph = self.add_tokens(graph_integer | graph_decimal)
    self.fst = final_graph.optimize()
def __init__(self, number_names: dict, alternative_formats: dict, deterministic: bool = False):
    """
    Builds the "cardinal" classifier and exposes several intermediate graphs
    as attributes (`cardinal_numbers_default`, `cardinal_numbers_nominative`,
    `optional_graph_negative`, `cardinal_numbers_with_optional_negative`,
    `cardinal_numbers_with_leading_zeros`, `single_digits_graph`, `final_graph`).

    Args:
        number_names: forwarded to get_cardinal_numbers().
        alternative_formats: forwarded to get_cardinal_numbers().
        deterministic: forwarded to the base class.
    """
    super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

    self.cardinal_numbers_default = self.get_cardinal_numbers(number_names, alternative_formats, mode="all")
    self.cardinal_numbers_nominative = self.get_cardinal_numbers(
        number_names, alternative_formats, mode="nominative"
    )

    # optional leading "-" -> negative: "true"
    self.optional_graph_negative = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, 0, 1
    )
    self.cardinal_numbers_with_optional_negative = (
        self.optional_graph_negative
        + pynutil.insert("integer: \"")
        + self.cardinal_numbers_default
        + pynutil.insert("\"")
    )

    # "03" -> remove leading zeros and verbalize
    strip_leading_zeros = pynini.closure(pynini.cross("0", ""))
    self.cardinal_numbers_with_leading_zeros = (strip_leading_zeros + self.cardinal_numbers_default).optimize()

    # "123" -> "один два три" (digit-by-digit reading)
    one_digit = pynini.compose(NEMO_DIGIT, self.cardinal_numbers_nominative)
    self.single_digits_graph = one_digit + pynini.closure(insert_space + one_digit)

    # optional quantity word after the number, separated by an existing or inserted space
    quantity_words = pynini.string_file(get_abs_path("data/numbers/quantity.tsv")).optimize()
    quantity_field = pynutil.insert("quantity: \"") + quantity_words + pynutil.insert("\"")
    optional_quantity = pynini.closure(
        (pynutil.add_weight(pynini.accep(NEMO_SPACE), -0.1) | insert_space) + quantity_field, 0, 1
    )

    serial_graph = self.get_serial_graph()

    # preferred reading: full cardinal with optional sign and quantity
    final_graph = (
        self.optional_graph_negative
        + pynutil.insert("integer: \"")
        + self.cardinal_numbers_with_leading_zeros
        + pynutil.insert("\"")
        + optional_quantity
    ).optimize()
    final_graph = pynutil.add_weight(final_graph, -0.1)

    # penalized fallback: digit-by-digit or serial reading
    final_graph |= (
        pynutil.insert("integer: \"")
        + pynutil.add_weight(self.single_digits_graph | serial_graph, 10)
        + pynutil.insert("\"")
    )

    # NOTE(review): the statement order below is kept on purpose. Whether
    # self.final_graph also receives the "-х" extension depends on whether `|=`
    # updates the FST in place in the installed pynini — confirm before reordering.
    self.final_graph = final_graph

    # to cover cases "2-х" -> "двух" (this is not covered by ordinal endings)
    digits_with_ending = NEMO_DIGIT ** (1, ...) + pynini.cross('-х', '')
    output_contains_ending = NEMO_SIGMA + pynini.accep("х\"") + NEMO_SIGMA
    final_graph |= pynini.compose(pynini.compose(digits_with_ending, final_graph), output_contains_ending)

    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def __init__(self, deterministic: bool = True):
    """
    Builds the "cardinal" classifier from the pre-compiled number-name FAR
    and exposes `graph`, `single_digits_graph`,
    `graph_hundred_component_at_least_one_none_zero_digit` and, in
    non-deterministic mode, `range_graph`.

    Args:
        deterministic: when False, digit-by-digit, "hundreds"-style and range
            readings are added as alternatives.
    """
    super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

    number_names = pynini.Far(get_abs_path("data/numbers/cardinal_number_name.far")).get_fst()

    # two or three digits, or a single digit other than "0"
    self.graph_hundred_component_at_least_one_none_zero_digit = (
        pynini.closure(NEMO_DIGIT, 2, 3) | pynini.difference(NEMO_DIGIT, pynini.accep("0"))
    ) @ number_names

    # one to three digits followed by groups of three digits, each optionally preceded by a dropped ","
    comma_grouped_digits = pynini.closure(NEMO_DIGIT, 1, 3) + pynini.closure(
        pynini.closure(pynutil.delete(","), 0, 1) + NEMO_DIGIT + NEMO_DIGIT + NEMO_DIGIT
    )
    self.graph = comma_grouped_digits @ number_names

    graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))

    # single digit read via the inverted digit/zero tables, or "0" read as "oh"
    one_digit = pynutil.add_weight(pynini.invert(graph_digit | graph_zero), 1.2) | pynutil.add_weight(
        pynini.cross("0", "oh"), 1.1
    )
    self.single_digits_graph = one_digit + pynini.closure(pynutil.insert(" ") + one_digit)

    if not deterministic:
        # digit-by-digit reading of comma-separated groups
        leading_groups = pynini.closure(self.single_digits_graph + pynutil.insert(" "), 1, 3)
        comma_groups = pynini.closure(
            pynutil.delete(",")
            + one_digit
            + pynutil.insert(" ")
            + one_digit
            + pynutil.insert(" ")
            + one_digit,
            1,
        )
        single_digits_graph_with_commas = leading_groups + comma_groups

        self.graph |= self.single_digits_graph | get_hundreds_graph() | single_digits_graph_with_commas

        # "<a>-<b>" -> "[from ]<a> to <b>" or "<a> <b>"
        self.range_graph = (
            pynini.closure(pynutil.insert("from "), 0, 1)
            + self.graph
            + (pynini.cross("-", " to ") | pynini.cross("-", " "))
            + self.graph
        )
        # "<a>x<b>" / "<a> x <b>" -> "<a> by <b>"
        self.range_graph |= (
            self.graph + (pynini.cross("x", " by ") | pynini.cross(" x ", " by ")) + self.graph
        )
        self.range_graph = self.range_graph.optimize()

    optional_minus_graph = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1
    )

    final_graph = self.graph | pynutil.add_weight(self.get_serial_graph(), 1.2)
    if not deterministic:
        final_graph |= self.range_graph

    final_graph = optional_minus_graph + pynutil.insert("integer: \"") + final_graph + pynutil.insert("\"")
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def __init__(self):
    """
    Builds the top-level classifier as a weighted union of all semiotic-class
    grammars; the result is stored, optimized, in `self.fst`.
    """
    super().__init__(name="tokenize_and_classify", kind="classify")

    # cardinal, ordinal and decimal grammar objects are shared with the grammars that build on them
    cardinal_grammar = CardinalFst()
    ordinal_grammar = OrdinalFst(cardinal_grammar)
    decimal_grammar = DecimalFst(cardinal_grammar)

    cardinal = cardinal_grammar.fst
    ordinal = ordinal_grammar.fst
    decimal = decimal_grammar.fst
    measure = MeasureFst(cardinal_grammar, decimal_grammar).fst
    date = DateFst(ordinal_grammar).fst
    word = WordFst().fst
    time = TimeFst().fst
    money = MoneyFst(cardinal_grammar, decimal_grammar).fst
    whitelist = WhiteListFst().fst

    # (grammar, weight) in union order; `word` carries by far the largest weight
    weighted_grammars = (
        (whitelist, 1.01),
        (time, 1.1),
        (date, 1.09),
        (decimal, 1.1),
        (measure, 1.1),
        (cardinal, 1.1),
        (ordinal, 1.1),
        (money, 1.1),
        (word, 100),
    )

    graph = None
    for grammar, weight in weighted_grammars:
        weighted = pynutil.add_weight(grammar, weight)
        graph = weighted if graph is None else graph | weighted

    self.fst = graph.optimize()
def get_address_graph(self, cardinal):
    """
    Finite state transducer for classifying street addresses, e.g.:

    2788 San Tomas Expy, Santa Clara, CA 95051 ->
        units: "address" cardinal { integer: "two seven eight eight San Tomas Expressway Santa Clara California nine five zero five one" } preserve_order: true

    Args:
        cardinal: cardinal grammar; its `single_digits_graph` reads the house
            number and the zip code, and it is passed to the ordinal tagger.

    Returns:
        Transducer for: number, optional direction, street words, optional ".",
        optional city, optional state, optional zip code.
    """
    ordinal_verbalizer = OrdinalVerbalizer().graph
    ordinal_tagger = OrdinalTagger(cardinal=cardinal).graph
    # tag an ordinal and verbalize it immediately
    ordinal_num = pynini.compose(
        pynutil.insert("integer: \"") + ordinal_tagger + pynutil.insert("\""), ordinal_verbalizer
    )

    # house number, read digit by digit
    address_num = pynini.closure(NEMO_DIGIT, 1) @ cardinal.single_digits_graph

    # optional compass letter after a space
    compass = (
        pynini.cross("E", "East")
        | pynini.cross("S", "South")
        | pynini.cross("W", "West")
        | pynini.cross("N", "North")
    )
    direction = pynini.closure(pynutil.add_weight(pynini.accep(NEMO_SPACE) + compass, -1), 0, 1)

    # space, optional ordinal, letters/spaces, then an entry of the address-word table
    street_suffix = pynini.string_file(get_abs_path("data/address/address_words.tsv"))
    address_words = (
        pynini.accep(NEMO_SPACE)
        + pynini.closure(ordinal_num, 0, 1)
        + pynini.closure(NEMO_ALPHA | NEMO_SPACE, 1)
        + street_suffix
    )

    def _optional_after_comma(part):
        # ", <part>" with the comma removed; the whole piece is optional
        return pynini.closure(pynini.cross(",", "") + pynini.accep(NEMO_SPACE) + part, 0, 1)

    city = _optional_after_comma(pynini.closure(NEMO_ALPHA | pynini.accep(NEMO_SPACE), 1))
    state = _optional_after_comma(pynini.invert(pynini.string_file(get_abs_path("data/address/states.tsv"))))

    # five digits read digit by digit; strongly preferred when present
    zip_digits = pynini.compose(NEMO_DIGIT ** 5, cardinal.single_digits_graph)
    zip_code = pynini.closure(
        pynutil.add_weight(
            pynini.closure(pynini.cross(",", ""), 0, 1) + pynini.accep(NEMO_SPACE) + zip_digits, -100
        ),
        0,
        1,
    )

    optional_period = pynini.closure(pynini.cross(".", ""), 0, 1)

    return address_num + direction + address_words + optional_period + city + state + zip_code
def __init__(self, cardinal: GraphFst, deterministic: bool = False):
    """
    Builds the "decimal" classifier: integer part, "," delimiter, one to four
    fractional digits and an optional quantity word. Exposes `integer_part`,
    `graph_fractional`, `optional_quantity` and `final_graph`.

    Args:
        cardinal: cardinal grammar; `cardinal_numbers_default`,
            `cardinal_numbers_with_leading_zeros` and `optional_graph_negative` are read.
        deterministic: forwarded to the base class.
    """
    super().__init__(name="decimal", kind="classify", deterministic=deterministic)

    whole_number = cardinal.cardinal_numbers_default
    fraction_number = cardinal.cardinal_numbers_with_leading_zeros

    # "," is removed and replaced by the spoken delimiter, optionally followed by " и" (penalized)
    delimiter_map = prepare_labels_for_insertion(get_abs_path("data/numbers/decimal_delimiter.tsv"))
    delimiter = (
        pynini.cross(",", "")
        + delimiter_map['@@decimal_delimiter@@']
        + pynini.closure(pynutil.add_weight(pynutil.insert(" и"), 0.5), 0, 1)
    ).optimize()

    decimal_endings_map = prepare_labels_for_insertion(get_abs_path("data/numbers/decimal_endings.tsv"))

    self.integer_part = whole_number + delimiter
    graph_integer = pynutil.insert("integer_part: \"") + self.integer_part + pynutil.insert("\"")

    # one alternative per fractional length: n digits take the ending keyed by 10**n
    ending_keys = ('10', '100', '1000', '10000')
    digits = NEMO_DIGIT
    graph_fractional = digits @ fraction_number + decimal_endings_map[ending_keys[0]]
    for ending_key in ending_keys[1:]:
        digits = digits + NEMO_DIGIT
        graph_fractional |= digits @ fraction_number + decimal_endings_map[ending_key]

    self.optional_quantity = pynini.string_file(get_abs_path("data/numbers/quantity.tsv")).optimize()

    self.graph_fractional = graph_fractional
    fractional_field = pynutil.insert("fractional_part: \"") + graph_fractional + pynutil.insert("\"")

    # optional quantity word, separated by an existing (preferred) or inserted space
    optional_quantity = pynini.closure(
        (pynutil.add_weight(pynini.accep(NEMO_SPACE), -0.1) | insert_space)
        + pynutil.insert("quantity: \"")
        + self.optional_quantity
        + pynutil.insert("\""),
        0,
        1,
    )

    self.final_graph = (
        cardinal.optional_graph_negative + graph_integer + insert_space + fractional_field + optional_quantity
    )

    # self.final_graph is then replaced by its token-wrapped version
    self.final_graph = self.add_tokens(self.final_graph)
    self.fst = self.final_graph.optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst):
    """
    Builds the "money" classifier: a spoken amount followed by a currency name
    (singular or plural), optionally followed by cents.

    Args:
        cardinal: cardinal grammar; its `graph_no_exception` is used for amounts.
        decimal: decimal grammar; its `final_graph_wo_negative` is used for decimal amounts.
    """
    super().__init__(name="money", kind="classify")
    # quantity, integer_part, fractional_part, currency

    number_graph = cardinal.graph_no_exception
    decimal_graph = decimal.final_graph_wo_negative

    currency_table = pynini.string_file(get_abs_path("data/currency.tsv"))
    currency_singular = pynini.invert(currency_table)
    # both the get_singulars() forms and the singular forms are accepted
    currency_any = get_singulars(currency_singular) | currency_singular
    currency_field = pynutil.insert("currency: \"") + convert_space(currency_any) + pynutil.insert("\"")

    # accept two digits as they are, left-pad a single digit with "0"
    two_digits = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT)

    # elf euro (und) vier cent, vier cent
    cents_standalone = (
        pynutil.insert("fractional_part: \"")
        + (pynutil.add_weight(number_graph, -0.7) @ two_digits)
        + delete_space
        + pynutil.delete("cent")
        + pynutil.insert("\"")
    )
    optional_cents_standalone = pynini.closure(
        delete_space
        + pynini.closure(pynutil.delete("und") + delete_space, 0, 1)
        + insert_space
        + cents_standalone,
        0,
        1,
    )

    # elf euro vierzig, only after integer
    optional_cents_suffix = pynini.closure(
        delete_extra_space
        + pynutil.insert("fractional_part: \"")
        + pynutil.add_weight(number_graph @ two_digits, -0.7)
        + pynutil.insert("\""),
        0,
        1,
    )

    graph_integer = (
        pynutil.insert("integer_part: \"")
        + number_graph
        + pynutil.insert("\"")
        + delete_extra_space
        + currency_field
        + (optional_cents_standalone | optional_cents_suffix)
    )

    decimal_amount = decimal_graph + delete_extra_space + currency_field
    # cents on their own are tagged as zero euro
    cents_only = pynutil.insert("currency: \"€\" integer_part: \"0\" ") + cents_standalone
    graph_decimal = decimal_amount | cents_only

    final_graph = self.add_tokens(graph_integer | graph_decimal)
    self.fst = final_graph.optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst):
    """
    Builds the "money" classifier: a spoken integer or decimal amount followed
    by a currency name and an optional fractional suffix.

    Args:
        cardinal: cardinal grammar; its `graph_no_exception` is used for amounts.
        decimal: decimal grammar; its `final_graph_wo_negative` is used for decimal amounts.
    """
    super().__init__(name="money", kind="classify")
    # quantity, integer_part, fractional_part, currency

    number_graph = cardinal.graph_no_exception
    decimal_graph = decimal.final_graph_wo_negative

    # "rưỡi" is tagged as fractional part "5"
    half = pynini.cross("rưỡi", "5")

    currency_table = pynini.string_file(get_abs_path("data/currency.tsv"))
    currency_names = pynini.invert(currency_table)
    currency_field = pynutil.insert("currency: \"") + convert_space(currency_names) + pynutil.insert("\"")

    # accept two digits as they are, left-pad a single digit with "0"
    two_digits = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT)

    # twelve dollars fifty, only after integer
    fractional_value = pynutil.add_weight(number_graph @ two_digits, -0.7) | half
    optional_cents_suffix = pynini.closure(
        delete_extra_space + pynutil.insert("fractional_part: \"") + fractional_value + pynutil.insert("\""),
        0,
        1,
    )

    graph_integer = (
        pynutil.insert("integer_part: \"")
        + number_graph
        + pynutil.insert("\"")
        + delete_extra_space
        + currency_field
        + optional_cents_suffix
    )
    graph_decimal = decimal_graph + delete_extra_space + currency_field + optional_cents_suffix

    final_graph = self.add_tokens(graph_integer | graph_decimal)
    self.fst = final_graph.optimize()
def __init__(self, deterministic: bool = True):
    """
    Builds the "ordinal" verbalizer: the quoted `integer:` value is extracted
    and its ending is rewritten into an ordinal form. Exposes `graph` and
    `suffix` (the end-of-string rewrite on its own).

    Args:
        deterministic: forwarded to the base class.
    """
    super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)

    digit_to_ordinal = pynini.string_file(get_abs_path("data/ordinals/digit.tsv")).invert()
    teen_to_ordinal = pynini.string_file(get_abs_path("data/ordinals/teen.tsv")).invert()

    # strip `integer: "` ... `"` and keep the quoted content
    integer_value = (
        pynutil.delete("integer:")
        + delete_space
        + pynutil.delete("\"")
        + pynini.closure(NEMO_NOT_QUOTE, 1)
        + pynutil.delete("\"")
    )

    # fallback with the largest weight: append "th"
    append_th = pynutil.insert("th", weight=0.01)
    ending_rules = (
        digit_to_ordinal
        | teen_to_ordinal
        | pynutil.add_weight(pynini.cross("ty", "tieth"), weight=0.001)
        | append_th
    )
    # the rules apply only at the end of the string
    suffix = pynini.cdrewrite(ending_rules, "", "[EOS]", NEMO_SIGMA).optimize()

    self.graph = pynini.compose(integer_value, suffix)
    self.suffix = suffix

    self.fst = self.delete_tokens(self.graph).optimize()
def __init__(self, deterministic: bool = True):
    """
    Builds the "telephone" verbalizer: the quoted values of the optional
    `country_code`, the `number_part` and the optional `extension` fields are
    emitted in that order.

    Args:
        deterministic: forwarded to the base class.
    """
    super().__init__(name="telephone", kind="verbalize", deterministic=deterministic)

    quoted_value = pynini.closure(NEMO_NOT_QUOTE, 1)
    component_separator = pynutil.insert(",")  # between components

    # country code, followed by "," and a space
    optional_country_code = pynini.closure(
        pynutil.delete("country_code: \"")
        + quoted_value
        + pynutil.delete("\"")
        + delete_space
        + component_separator
        + insert_space,
        0,
        1,
    )

    # number part; dropping one space before the closing quote is slightly preferred
    drop_trailing_space = pynini.closure(pynutil.add_weight(pynutil.delete(" "), -0.1), 0, 1)
    number_part = (
        pynutil.delete("number_part: \"") + quoted_value + drop_trailing_space + pynutil.delete("\"")
    )

    # extension, separated from the number by a single space
    optional_extension = pynini.closure(
        delete_space
        + insert_space
        + pynutil.delete("extension: \"")
        + quoted_value
        + pynutil.delete("\""),
        0,
        1,
    )

    telephone = optional_country_code + number_part + optional_extension
    self.fst = self.delete_tokens(telephone).optimize()
def get_serial_graph(self):
    """
    Finite state transducer for classifying serial.
    The serial is a combination of digits, letters and dashes, e.g.:
    c325-b -> tokens { cardinal { integer: "c three two five b" } }

    Reads `self.deterministic`, `self.single_digits_graph` and `self.graph`.

    Returns:
        The serial graph with an added weight of 10.
    """
    if self.deterministic:
        num_graph = self.single_digits_graph
        alpha = NEMO_ALPHA
    else:
        num_graph = self.graph
        letter_pronunciation = pynini.string_map(load_labels(get_abs_path("data/letter_pronunciation.tsv")))
        # FIX: build a new union instead of `alpha = NEMO_ALPHA; alpha |= ...`.
        # Where pynini implements `|=` as an in-place union, the original form
        # modified the shared module-level NEMO_ALPHA on every call.
        alpha = NEMO_ALPHA | letter_pronunciation

    # components are joined by an inserted space; "-" and "/" are read as a space
    delimiter = insert_space | pynini.cross("-", " ") | pynini.cross("/", " ")

    letter_num = pynini.closure(alpha + delimiter, 1) + num_graph
    num_letter = pynini.closure(num_graph + delimiter, 1) + alpha
    num_delimiter_num = pynini.closure(num_graph + delimiter, 1) + num_graph
    next_alpha_or_num = pynini.closure(delimiter + (alpha | num_graph))

    serial_graph = (letter_num | num_letter | num_delimiter_num) + next_alpha_or_num

    if not self.deterministic:
        # optional trailing "s", kept as it is or read as "es"
        serial_graph += pynini.closure(pynini.accep("s") | pynini.cross("s", "es"), 0, 1)

    serial_graph = serial_graph.optimize()

    return pynutil.add_weight(serial_graph, 10)
def __init__(self):
    """
    Builds the "date" verbalizer for "<day> <month>" and "<day> <month> <year>",
    removing the field names, the quotes and any trailing
    `preserve_order` / `field_order` fields.
    """
    super().__init__(name="date", kind="verbalize")

    def _quoted_field(name, value):
        # <name> "<value>" with the field name and the quotes removed
        return pynutil.delete(name) + delete_space + pynutil.delete("\"") + value + pynutil.delete("\"")

    any_value = pynini.closure(NEMO_NOT_QUOTE, 1)

    # first of the month is ordinal
    convert_primer = pynini.cross('1', '1ᵉʳ')
    day = _quoted_field("day:", any_value | pynutil.add_weight(convert_primer, -1))
    month = _quoted_field("month:", pynini.closure(NEMO_NOT_QUOTE, 1))
    year = _quoted_field("year:", pynini.closure(NEMO_NOT_QUOTE, 1))

    # day month
    graph_dm = day + delete_extra_space + month
    graph_dmy = graph_dm + delete_extra_space + year

    preserve_order = (
        pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space
    )
    # NOTE(review): the field_order value is matched as a single NEMO_NOT_QUOTE symbol — confirm this is intended
    field_order = (
        pynutil.delete("field_order:")
        + delete_space
        + pynutil.delete("\"")
        + NEMO_NOT_QUOTE
        + pynutil.delete("\"")
        + delete_space
    )
    optional_preserve_order = pynini.closure(preserve_order | field_order)

    final_graph = (graph_dm | graph_dmy) + delete_space + optional_preserve_order

    self.fst = self.delete_tokens(final_graph).optimize()
def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
    """
    Builds the "whitelist" classifier from TSV label files. Exposes `graph`
    (raw mapping) and `final_graph` (after convert_space, optimized).

    Args:
        input_case: when "lower_cased", the first column of every entry is lower-cased.
        deterministic: when False, extra alternatives are added (lower-cased
            default entries, measurement units) and `input_file` extends the
            default whitelist instead of replacing it.
        input_file: optional path to a user-provided whitelist.
    """
    super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

    def _get_whitelist_graph(input_case, file):
        entries = load_labels(file)
        if input_case == "lower_cased":
            entries = [[entry[0].lower()] + entry[1:] for entry in entries]
        return pynini.string_map(entries)

    default_whitelist = get_abs_path("data/whitelist.tsv")
    graph = _get_whitelist_graph(input_case, default_whitelist)

    if not deterministic and input_case != "lower_cased":
        # also accept the lower-cased spelling of the default entries, at a small cost
        graph |= pynutil.add_weight(
            _get_whitelist_graph("lower_cased", get_abs_path("data/whitelist.tsv")), weight=0.0001
        )

    if input_file:
        whitelist_provided = _get_whitelist_graph(input_case, input_file)
        if deterministic:
            # the user-provided list replaces everything built so far
            graph = whitelist_provided
        else:
            graph |= whitelist_provided

    if not deterministic:
        units_graph = _get_whitelist_graph(input_case, file=get_abs_path("data/measure/measurements.tsv"))
        graph |= units_graph

    self.graph = graph
    self.final_graph = convert_space(self.graph).optimize()
    self.fst = (pynutil.insert("name: \"") + self.final_graph + pynutil.insert("\"")).optimize()
def __init__(self, tn_cardinal_tagger: GraphFst, deterministic: bool = True):
    """
    Builds the "cardinal" classifier by inverting the graphs of a text
    normalization cardinal tagger. Exposes
    `graph_hundred_component_at_least_one_none_zero_digit`, `graph_ties`,
    `graph_no_exception`, `digit`, `graph` and `optional_minus_graph`.

    Args:
        tn_cardinal_tagger: tagger whose `graph`,
            `graph_hundred_component_at_least_one_none_zero_digit`,
            `two_digit_non_zero` and `digit` graphs are inverted.
        deterministic: forwarded to the base class.
    """
    super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

    # applied to the tagger output before inversion: spaces may be dropped
    optional_delete_space = pynini.closure(NEMO_SIGMA | pynutil.delete(" "))

    def _inverted(tn_graph):
        return (tn_graph @ optional_delete_space).invert().optimize()

    inverted_cardinal = _inverted(tn_cardinal_tagger.graph)
    self.graph_hundred_component_at_least_one_none_zero_digit = _inverted(
        tn_cardinal_tagger.graph_hundred_component_at_least_one_none_zero_digit
    )
    self.graph_ties = _inverted(tn_cardinal_tagger.two_digit_non_zero)

    # this is to make sure if there is an ambiguity with decimal, decimal is chosen, e.g. 1000000 vs. 1 million
    inverted_cardinal = pynutil.add_weight(inverted_cardinal, weight=0.001)
    self.graph_no_exception = inverted_cardinal

    # the inputs of the weight-stripped, inverted digit graph are excluded from self.graph
    self.digit = pynini.arcmap(tn_cardinal_tagger.digit, map_type="rmweight").invert().optimize()
    excluded_inputs = pynini.project(self.digit, 'input')
    allowed_inputs = pynini.project(inverted_cardinal, "input") - excluded_inputs.arcsort()
    self.graph = allowed_inputs @ inverted_cardinal

    # optional leading "minus " -> negative: "-"
    self.optional_minus_graph = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("minus ", "\"-\" "), 0, 1
    )

    final_graph = (
        self.optional_minus_graph + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")
    )
    self.fst = self.add_tokens(final_graph).optimize()
def __init__(self, cardinal: GraphFst):
    """
    Builds the date classifier. Three layouts are accepted:
        day-month(-year): optional "ngày"/"mùng" + day + "tháng" + month (+ "năm" + year)
        year only:        "năm" + year
        month(-day)(-year): "tháng" + month (+ "ngày" + day) (+ "năm" + year)
    The first two layouts are tagged with "preserve_order: true".

    Args:
        cardinal: cardinal tagger; its `graph_no_exception` is used for the day value
    """
    super().__init__(name="date", kind="classify")

    number_graph = cardinal.graph_no_exception

    year_weight = 0.001
    weighted_year = pynutil.add_weight(_get_year_graph(), year_weight)

    months = _get_month_graph()
    month_field = pynutil.insert("month: \"") + months + pynutil.insert("\"")
    day_field = pynutil.insert("day: \"") + number_graph + pynutil.insert("\"")

    # the day keyword in front of the number may be omitted
    day_keyword = pynutil.delete(pynini.union("ngày", "mùng") + delete_space)
    optional_day_keyword = pynini.closure(day_keyword, 0, 1)

    # year that follows a day/month; the year weight is cancelled out here
    trailing_year = (
        delete_extra_space
        + pynutil.delete("năm")
        + delete_extra_space
        + pynutil.insert("year: \"")
        + pynutil.add_weight(weighted_year, -year_weight)
        + pynutil.insert("\"")
    )
    optional_trailing_year = pynini.closure(trailing_year, 0, 1)

    day_after_month = (
        delete_space + pynutil.delete("ngày") + delete_extra_space + day_field + optional_trailing_year
    )
    graph_mdy = (
        pynutil.delete("tháng")
        + delete_space
        + month_field
        + (day_after_month | optional_trailing_year)
    )

    graph_dmy = (
        optional_day_keyword
        + day_field
        + delete_space
        + pynutil.delete("tháng")
        + delete_extra_space
        + month_field
        + optional_trailing_year
    )

    # stand-alone year keeps the year weight
    year_only = (
        pynutil.delete("năm")
        + delete_extra_space
        + pynutil.insert("year: \"")
        + weighted_year
        + pynutil.insert("\"")
    )

    ordered = (graph_dmy | year_only) + pynutil.insert(" preserve_order: true")
    tagged = pynini.union(ordered, graph_mdy)
    tagged = self.add_tokens(tagged)
    self.fst = tagged.optimize()
def __init__(self):
    """
    Builds the time classifier for spoken-form input. Two layouts are accepted, each optionally
    followed by a suffix and a time zone:
        hour then minutes, e.g. five o' clock / two o eight / two thirty five / two pm
        minutes "past" hour, e.g. 10 past four / quarter past four / half past four
    """
    super().__init__(name="time", kind="classify")
    # hours, minutes, seconds, suffix, zone, style, speak_period
    suffixes = pynini.string_file(get_abs_path("data/time_suffix.tsv"))
    zones = pynini.invert(pynini.string_file(get_abs_path("data/time_zone.tsv")))

    # only used for < 1000 thousand -> 0 weight
    number = pynutil.add_weight(CardinalFst().graph_no_exception, weight=-0.7)

    hour_words = [num_to_word(n) for n in range(24)]
    single_minute_words = [num_to_word(n) for n in range(1, 10)]
    double_minute_words = [num_to_word(n) for n in range(10, 60)]

    hours = pynini.union(*hour_words) @ number
    minutes_1_to_9 = pynini.union(*single_minute_words) @ number
    minutes_10_to_59 = pynini.union(*double_minute_words) @ number
    minutes_named = pynini.cross("half", "30") | pynini.cross("quarter", "15")

    # every spelling of "o'clock" is mapped to the empty string
    oclock = pynini.cross(pynini.union("o' clock", "o clock", "o'clock", "oclock"), "")

    hour_field = pynutil.insert("hours: \"") + hours + pynutil.insert("\"")

    minute_value = (
        pynutil.insert("00")
        | oclock + pynutil.insert("00")
        | pynutil.delete("o") + delete_space + minutes_1_to_9
        | minutes_10_to_59
    )
    minute_field = pynutil.insert("minutes: \"") + minute_value + pynutil.insert("\"")

    suffix_field = pynutil.insert("suffix: \"") + convert_space(suffixes) + pynutil.insert("\"")
    optional_suffix = pynini.closure(delete_space + insert_space + suffix_field, 0, 1)
    optional_zone = pynini.closure(
        delete_space + insert_space + pynutil.insert("zone: \"") + convert_space(zones) + pynutil.insert("\""),
        0,
        1,
    )

    # five o' clock
    # two o eight, two thirty five (am/pm)
    # two pm/am
    graph_hm = hour_field + delete_extra_space + minute_field

    # 10 past four, quarter past four, half past four
    graph_mh = (
        pynutil.insert("minutes: \"")
        + pynini.union(minutes_1_to_9, minutes_10_to_59, minutes_named)
        + pynutil.insert("\"")
        + delete_space
        + pynutil.delete("past")
        + delete_extra_space
        + hour_field
    )

    tagged = ((graph_hm | graph_mh) + optional_suffix + optional_zone).optimize()
    tagged = self.add_tokens(tagged)
    self.fst = tagged.optimize()
def __init__(self, ordinal: GraphFst, deterministic: bool = True):
    """
    Builds the date verbalizer. It strips the `day:`, `month:` and `year:` field markup and emits either
        month (day) (year), a bare year, or
        "the " day " of " month (year)   -- added with a small extra weight
    followed by optional `preserve_order` / `field_order` markers, which are deleted.

    Args:
        ordinal: ordinal verbalizer; its `suffix` graph is composed with the day value
        deterministic: if False, the day value is also accepted without going through `ordinal.suffix`
    """
    super().__init__(name="date", kind="verbalize", deterministic=deterministic)

    def _open_field(label):
        """Deletes `label`, optional space and the opening quote of a field."""
        return pynutil.delete(label) + delete_space + pynutil.delete("\"")

    close_quote = pynutil.delete("\"")
    unquoted_value = pynini.closure(NEMO_NOT_QUOTE, 1)

    day_cardinal = _open_field("day:") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
    day = day_cardinal @ ordinal.suffix
    if not deterministic:
        day |= day_cardinal

    month = _open_field("month:") + unquoted_value + close_quote

    # space in front of the closing quote of the year is removed as well
    year = _open_field("year:") + pynini.closure(NEMO_NOT_QUOTE, 1) + delete_space + pynutil.delete("\"")

    # month (day) year
    graph_mdy = (
        month
        + pynini.closure(delete_extra_space + day, 0, 1)
        + pynini.closure(delete_extra_space + year, 0, 1)
    )

    # day month year
    graph_dmy = (
        pynutil.insert("the ")
        + day
        + delete_extra_space
        + pynutil.insert("of ")
        + month
        + pynini.closure(delete_extra_space + year, 0, 1)
    )

    drop_preserve_order = (
        pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space
    )
    # the quoted field_order value is a single NEMO_NOT_QUOTE symbol and is passed through
    drop_field_order = _open_field("field_order:") + NEMO_NOT_QUOTE + pynutil.delete("\"") + delete_space
    optional_preserve_order = pynini.closure(drop_preserve_order | drop_field_order)

    date_graph = graph_mdy | year | pynutil.add_weight(graph_dmy, 0.001)
    final_graph = date_graph + delete_space + optional_preserve_order

    delete_tokens = self.delete_tokens(final_graph)
    self.fst = delete_tokens.optimize()
def get_serial_graph(self):
    """
    Finite state transducer for classifying serial values, i.e. combinations of letters and numbers.
    Values without delimiters get a space inserted at every letter/digit boundary; values joined by
    "-" or "/" and number-only values with at least two delimiters are covered as well, e.g.:
        c325b -> tokens { cardinal { integer: "c three two five b" } }

    Returns:
        the union of all serial patterns, with weight 2 added
    """
    numbers = self.single_digits_graph
    if not self.deterministic:
        numbers |= self.graph

    # add space between letter and digit
    space_after_alpha = pynini.cdrewrite(pynutil.insert(" "), NEMO_ALPHA, NEMO_DIGIT, NEMO_SIGMA)
    space_after_digit = pynini.cdrewrite(pynutil.insert(" "), NEMO_DIGIT, NEMO_ALPHA, NEMO_SIGMA)
    spaced = pynini.compose(space_after_alpha, space_after_digit)

    # make sure at least one digit and letter is present
    non_spaces = pynini.closure(NEMO_NOT_SPACE)
    alpha_then_digit = non_spaces + NEMO_ALPHA + non_spaces + NEMO_DIGIT + non_spaces
    digit_then_alpha = non_spaces + NEMO_DIGIT + non_spaces + NEMO_ALPHA + non_spaces
    spaced = pynini.compose(alpha_then_digit | digit_then_alpha, spaced)

    space = pynini.accep(" ")

    # value that starts with letters
    starts_with_alpha = (
        pynini.closure(pynini.closure(NEMO_ALPHA, 1) + space, 1)
        + numbers
        + pynini.closure(space + pynini.closure(NEMO_ALPHA) + pynini.closure(space + numbers, 0, 1))
    )
    serial_graph = pynini.compose(spaced, starts_with_alpha)

    # value that starts with a number
    starts_with_number = (
        numbers
        + space
        + pynini.closure(NEMO_ALPHA, 1)
        + pynini.closure(space + numbers + pynini.closure(space + pynini.closure(NEMO_ALPHA), 0, 1))
    )
    serial_graph |= pynini.compose(spaced, starts_with_number)

    # serial graph with delimiter
    delimiter = pynini.accep("-") | pynini.accep("/")
    letters = pynini.closure(NEMO_ALPHA, 1)
    letters_then_number = letters + delimiter + numbers
    numbers_then_letters = pynini.closure(numbers + delimiter, 1) + letters
    continuation = pynini.closure(delimiter + (letters | numbers))
    continuation |= pynini.closure(delimiter + numbers + pynutil.insert(" ") + letters)

    serial_graph |= letters_then_number + continuation
    serial_graph |= numbers_then_letters + continuation

    # numbers only with 2+ delimiters
    serial_graph |= (
        numbers + delimiter + numbers + delimiter + numbers + pynini.closure(delimiter + numbers)
    )
    return pynutil.add_weight(serial_graph, 2)
def __init__(self, cardinal: GraphFst):
    """
    Builds the time classifier for written-form input. Accepted layouts:
        hour ":" minutes, with optional suffix and zone   (2:30 pm, 02:30, 2:00)
        hour "." minutes, with mandatory suffix           (2.xx pm/am)
        hour with mandatory suffix and optional zone      (2 pm est)
    Minutes written as "00" are dropped.

    Args:
        cardinal: cardinal tagger; its `graph` is composed with the hour and minute digits
    """
    super().__init__(name="time", kind="classify")
    suffixes = pynini.string_file(get_abs_path("data/time_suffix.tsv"))
    zones = pynini.string_file(get_abs_path("data/time_zone.tsv"))

    # only used for < 1000 thousand -> 0 weight
    number = pynutil.add_weight(cardinal.graph, weight=-0.7)

    hour_labels = list(map(str, range(24)))
    single_minute_labels = list(map(str, range(1, 10)))
    double_minute_labels = list(map(str, range(10, 60)))

    # two digits are kept as they are; a single digit may carry a leading "0", which is deleted
    strip_leading_zero = (NEMO_DIGIT + NEMO_DIGIT) | (
        pynini.closure(pynutil.delete("0"), 0, 1) + NEMO_DIGIT
    )

    hours = strip_leading_zero @ pynini.union(*hour_labels) @ number
    minutes_1_to_9 = pynini.union(*single_minute_labels) @ number
    minutes_10_to_59 = pynini.union(*double_minute_labels) @ number

    hour_field = pynutil.insert("hours: \"") + hours + pynutil.insert("\"")

    # a leading "0" of the minutes is rewritten to "o"
    minute_value = pynini.cross("0", "o") + insert_space + minutes_1_to_9 | minutes_10_to_59
    minute_field = pynutil.insert("minutes: \"") + minute_value + pynutil.insert("\"")

    suffix_field = pynutil.insert("suffix: \"") + convert_space(suffixes) + pynutil.insert("\"")
    optional_suffix = pynini.closure(delete_space + insert_space + suffix_field, 0, 1)
    optional_zone = pynini.closure(
        delete_space + insert_space + pynutil.insert("zone: \"") + convert_space(zones) + pynutil.insert("\""),
        0,
        1,
    )

    def _hour_and_minutes(separator):
        """Hour field, deleted `separator`, then either dropped "00" or the minute field."""
        return hour_field + pynutil.delete(separator) + (pynutil.delete("00") | insert_space + minute_field)

    # 2:30 pm, 02:30, 2:00
    graph_hm = _hour_and_minutes(":") + optional_suffix + optional_zone

    # 2.xx pm/am
    graph_hm2 = _hour_and_minutes(".") + delete_space + insert_space + suffix_field + optional_zone

    # 2 pm est
    graph_h = hour_field + delete_space + insert_space + suffix_field + optional_zone

    tagged = (graph_hm | graph_h | graph_hm2).optimize()
    tagged = self.add_tokens(tagged)
    self.fst = tagged.optimize()
def __init__(self, deterministic: bool = True):
    """
    Builds the telephone classifier. Two kinds of input are tagged:
        phone numbers: optional country code, area part, a 7-symbol number and an optional extension
        IP-like values: four groups of one to three digits joined by ".", read out with " dot "
    Both are emitted inside a `number_part` field.

    Args:
        deterministic: forwarded to the parent constructor
    """
    super().__init__(name="telephone", kind="classify", deterministic=deterministic)

    component_separator = pynutil.insert(", ")  # between components

    spoken_digit = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    ).optimize() | pynini.cross("0", "o")

    # country code: optional "+" followed by one to three digits
    country_code = (
        pynutil.insert("country_code: \"")
        + pynini.closure(pynutil.delete("+"), 0, 1)
        + pynini.closure(spoken_digit + insert_space, 0, 2)
        + spoken_digit
        + pynutil.insert("\"")
    )
    optional_country_code = pynini.closure(
        country_code + pynini.closure(pynutil.delete("-"), 0, 1) + delete_space + insert_space, 0, 1
    )

    # area part: three digits, with "800" read as "eight hundred" at a negative weight
    area_800 = pynutil.add_weight(pynini.cross("800", "eight hundred"), -1.1)
    area_digits = pynini.closure(spoken_digit + insert_space, 2, 2) + spoken_digit
    area_value = area_digits | area_800

    area_with_dash = area_value + pynutil.delete("-")
    area_in_parens = pynutil.delete("(") + area_value + (pynutil.delete(") ") | pynutil.delete(")-"))
    area_part = (area_with_dash | area_in_parens) + component_separator

    # exactly seven digits/letters, each optionally followed by "-" or " "
    optional_separator = pynini.closure(pynini.union("-", " "), 0, 1)
    seven_symbols = ((NEMO_DIGIT + optional_separator) | (NEMO_ALPHA + optional_separator)) ** 7

    spoken_symbols = pynini.closure(
        (NEMO_DIGIT @ spoken_digit) + (insert_space | pynini.cross("-", ", "))
        | NEMO_ALPHA
        | (NEMO_ALPHA + pynini.cross("-", " "))
    )
    spoken_symbols = pynini.compose(seven_symbols, spoken_symbols)

    number_part = area_part + spoken_symbols
    number_part = pynutil.insert("number_part: \"") + number_part + pynutil.insert("\"")

    # extension: one to four digits
    extension = (
        pynutil.insert("extension: \"")
        + pynini.closure(spoken_digit + insert_space, 0, 3)
        + spoken_digit
        + pynutil.insert("\"")
    )
    optional_extension = pynini.closure(insert_space + extension, 0, 1)

    graph = optional_country_code + number_part + optional_extension

    # ip
    ip_group = pynini.compose(
        NEMO_DIGIT ** (1, 3), spoken_digit + pynini.closure(pynutil.insert(" ") + spoken_digit)
    ).optimize()
    ip_graph = ip_group + (pynini.cross(".", " dot ") + ip_group) ** 3
    graph |= pynutil.insert("number_part: \"") + ip_graph.optimize() + pynutil.insert("\"")

    tagged = self.add_tokens(graph)
    self.fst = tagged.optimize()