def get_quantity(
    decimal: 'pynini.FstLike', cardinal_up_to_hundred: 'pynini.FstLike', include_abbr: bool
) -> 'pynini.FstLike':
    """
    Builds the FST that tags either a cardinal or a decimal followed by a quantity,
    e.g. 1 million -> integer_part: "one" quantity: "million"
    e.g. 1.5 million -> integer_part: "one" fractional_part: "five" quantity: "million"

    Args:
        decimal: decimal FST
        cardinal_up_to_hundred: cardinal FST
        include_abbr: when True, the inputs of `quantities_abbr` are accepted in addition
            to those of `quantities`
    """
    optional_space = pynini.closure(pynutil.delete(" "), 0, 1)
    thousand_forms = pynini.union("k", "K", "thousand")

    # Quantity inputs allowed after a plain cardinal: everything except the "thousand" forms.
    cardinal_quantities = pynini.project(quantities, "input") - thousand_forms
    if include_abbr:
        cardinal_quantities |= pynini.project(quantities_abbr, "input") - thousand_forms

    # Quantities allowed after a decimal.
    decimal_quantities = (quantities | quantities_abbr) if include_abbr else quantities

    cardinal_branch = (
        pynutil.insert("integer_part: \"")
        + cardinal_up_to_hundred
        + pynutil.insert("\"")
        + optional_space
        + pynutil.insert(" quantity: \"")
        + pynini.compose(cardinal_quantities, quantities | quantities_abbr)
        + pynutil.insert("\"")
    )
    decimal_branch = (
        decimal
        + optional_space
        + pynutil.insert("quantity: \"")
        + decimal_quantities
        + pynutil.insert("\"")
    )
    res = cardinal_branch
    res |= decimal_branch
    return res
def get_serial_graph(self):
    """
    Finite state transducer for classifying serial.
        The serial is a combination of digits, letters and dashes, e.g.:
        c325-b -> tokens { cardinal { integer: "си три два пять би" } }
    """
    digits = self.single_digits_graph
    letters = TO_CYRILLIC | RU_ALPHA
    # A delimiter is either nothing (a space is inserted) or "-" / "/" rewritten to a space.
    sep = insert_space | pynini.cross("-", " ") | pynini.cross("/", " ")

    digits_then_sep = pynini.closure(digits + sep, 1)
    starts_with_letters = pynini.closure(letters + sep, 1) + digits
    digits_then_letter = digits_then_sep + letters
    digits_then_digits = digits_then_sep + digits
    tail = pynini.closure(sep + (letters | digits))

    candidates = (starts_with_letters | digits_then_letter | digits_then_digits) + tail

    # at least 1 alpha and 1 digit is present
    any_letter = RU_ALPHA | pynini.project(TO_CYRILLIC, "input")
    letter_before_digit = NEMO_SIGMA + any_letter + NEMO_SIGMA + NEMO_DIGIT + NEMO_SIGMA
    digit_before_letter = NEMO_SIGMA + NEMO_DIGIT + NEMO_SIGMA + any_letter + NEMO_SIGMA
    has_letter_and_digit = letter_before_digit | digit_before_letter
    serial_graph = pynini.compose(has_letter_and_digit, candidates.optimize()).optimize()

    # numbers only with 2+ delimiters
    digits_only = digits + sep + digits + sep + digits + pynini.closure(sep + digits)
    serial_graph |= digits_only.optimize()

    return serial_graph.optimize()
def __init__(self, tn_cardinal_tagger: GraphFst, deterministic: bool = True):
    """
    Builds the cardinal classifier by inverting the graphs of a text-normalization
    cardinal tagger.

    Args:
        tn_cardinal_tagger: tagger whose `graph`, `digit`, `two_digit_non_zero` and
            `graph_hundred_component_at_least_one_none_zero_digit` graphs are inverted
        deterministic: forwarded unchanged to the base class constructor
    """
    super().__init__(name="cardinal", kind="classify", deterministic=deterministic)
    # add_space_between_chars = pynini.cdrewrite(pynini.closure(insert_space, 0, 1), NEMO_CHAR, NEMO_CHAR, NEMO_SIGMA)
    optional_delete_space = pynini.closure(NEMO_SIGMA | pynutil.delete(" "))

    def _inverted(tn_graph):
        # Compose with the space-deleting closure, then swap input and output sides.
        return pynini.compose(tn_graph, optional_delete_space).invert().optimize()

    cardinal = _inverted(tn_cardinal_tagger.graph)
    self.graph_hundred_component_at_least_one_none_zero_digit = _inverted(
        tn_cardinal_tagger.graph_hundred_component_at_least_one_none_zero_digit
    )
    self.graph_ties = _inverted(tn_cardinal_tagger.two_digit_non_zero)

    # this is to make sure if there is an ambiguity with decimal, decimal is chosen, e.g. 1000000 vs. 1 million
    cardinal = pynutil.add_weight(cardinal, weight=0.001)
    self.graph_no_exception = cardinal

    self.digit = pynini.arcmap(tn_cardinal_tagger.digit, map_type="rmweight").invert().optimize()
    # Inputs of `self.digit` are removed from the inputs accepted by `self.graph`.
    excluded_inputs = pynini.project(self.digit, 'input')
    allowed_inputs = pynini.project(cardinal, "input") - excluded_inputs.arcsort()
    self.graph = allowed_inputs @ cardinal

    minus = pynutil.insert("negative: ") + pynini.cross("minus ", "\"-\" ")
    self.optional_minus_graph = pynini.closure(minus, 0, 1)

    tagged = self.optional_minus_graph + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")
    tagged = self.add_tokens(tagged)
    self.fst = tagged.optimize()
def __init__(self, whitelist: 'pynini.FstLike', deterministic: bool = True):
    """
    Builds the abbreviation classifier for sequences of upper-case letters,
    with or without dots, excluding inputs present in the whitelist.

    Args:
        whitelist: object whose `.graph` input side lists strings to exclude
        deterministic: forwarded unchanged to the base class constructor
    """
    super().__init__(name="abbreviation", kind="classify", deterministic=deterministic)

    dot = pynini.accep(".")
    dotted_letter = NEMO_UPPER + dot

    # A.B.C. -> A. B. C.
    dotted_spaced = dotted_letter + pynini.closure(insert_space + NEMO_UPPER + dot, 1)
    # A.B.C. -> A.B.C.
    dotted_verbatim = dotted_letter + pynini.closure(NEMO_UPPER + dot, 1)
    # ABC -> ABC
    plain_verbatim = NEMO_UPPER + pynini.closure(NEMO_UPPER, 1)
    # ABC -> A B C
    plain_spaced = NEMO_UPPER + pynini.closure(insert_space + NEMO_UPPER, 1)

    graph = dotted_spaced
    graph |= dotted_verbatim
    graph |= plain_verbatim
    graph |= plain_spaced

    # exclude words that are included in the whitelist
    candidate_inputs = pynini.project(graph, "input")
    whitelisted_inputs = pynini.project(whitelist.graph, "input")
    graph = pynini.compose(pynini.difference(candidate_inputs, whitelisted_inputs), graph)

    graph = pynutil.insert("value: \"") + graph.optimize() + pynutil.insert("\"")
    graph = self.add_tokens(graph)
    self.fst = graph.optimize()
def __init__(self, whitelist: 'pynini.FstLike', deterministic: bool = True):
    """
    Builds the abbreviation classifier from a low-weight main pattern (upper-case
    letters separated by inserted spaces) and several higher-weight variants,
    excluding inputs present in the whitelist.

    Args:
        whitelist: object whose `.graph` input side lists strings to exclude
        deterministic: forwarded unchanged to the base class constructor
    """
    super().__init__(name="abbreviation", kind="classify", deterministic=deterministic)

    delete_dot = pynutil.delete(".")

    # Two or more upper-case letters, a space inserted between each pair.
    main_graph = NEMO_UPPER + pynini.closure(insert_space + NEMO_UPPER, 1)

    # TO_LOWER first, then any number of TO_LOWER / NEMO_LOWER symbols, space-separated.
    lowered = pynutil.add_weight(
        TO_LOWER + pynini.closure(insert_space + pynini.union(TO_LOWER | NEMO_LOWER)), 110
    )
    # Two or more upper-case letters followed by one or more spaced lower-case letters.
    upper_then_lower = pynutil.add_weight(
        pynini.closure(NEMO_UPPER, 2) + pynini.closure(insert_space + NEMO_LOWER, 1), 110
    )
    # Dotted upper-case letters with the dots removed (no extra weight on this variant).
    dotted_upper = NEMO_UPPER + delete_dot + pynini.closure(insert_space + NEMO_UPPER + delete_dot)
    # Dotted TO_LOWER letters with the dots removed.
    dotted_lowered = pynutil.add_weight(
        TO_LOWER + delete_dot + pynini.closure(insert_space + TO_LOWER + delete_dot), 110
    )

    misc_graph = lowered
    misc_graph |= upper_then_lower
    misc_graph |= dotted_upper
    misc_graph |= dotted_lowered

    # set weight of the misc graph to the value higher then word
    weighted_main = pynutil.add_weight(main_graph.optimize(), 10)
    weighted_misc = pynutil.add_weight(misc_graph.optimize(), 101)
    graph = weighted_main | weighted_misc

    # exclude words that are included in the whitelist
    allowed_inputs = pynini.difference(
        pynini.project(graph, "input"), pynini.project(whitelist.graph, "input")
    )
    graph = pynini.compose(allowed_inputs, graph)

    graph = pynutil.insert("value: \"") + graph.optimize() + pynutil.insert("\"")
    graph = self.add_tokens(graph)
    self.fst = graph.optimize()
def __init__(self, input_case: str, deterministic: bool = True):
    """
    Builds the whitelist classifier from `data/whitelist.tsv`, multi-character
    entries of `data/measurements.tsv`, and space-separated `TO_LATIN` sequences.

    Args:
        input_case: passed to the label loader helper; see the note inside it
        deterministic: forwarded unchanged to the base class constructor
    """
    super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

    def _get_whitelist_graph(input_case, file="data/whitelist.tsv"):
        """Loads a TSV of labels and returns a string map with lower-cased first column."""
        entries = load_labels(get_abs_path(file))
        # NOTE(review): the original code branched on `input_case == "lower_cased"` but
        # both branches lower-cased the first column identically, so the branch was
        # dead. The parameter is kept for interface compatibility; confirm whether
        # cased input was meant to preserve the original casing.
        entries = [[row[0].lower()] + row[1:] for row in entries]
        return pynini.string_map(entries)

    graph = _get_whitelist_graph(input_case)

    units_graph = _get_whitelist_graph(input_case, file="data/measurements.tsv")
    # do not replace single letter units, like `м` or `°`
    units_graph = pynini.compose(
        pynini.difference(pynini.project(units_graph, "input"), NEMO_ALPHA), units_graph
    )
    graph |= units_graph.optimize()
    graph |= TO_LATIN + pynini.closure(pynutil.insert(" ") + TO_LATIN)

    self.final_graph = convert_space(graph)
    self.fst = (pynutil.insert("name: \"") + self.final_graph + pynutil.insert("\"")).optimize()
def _make_sigma_star(self) -> pynini.Fst:
    r"""Builds \Sigma^* over bytes plus the feature labels.

    Returns:
      The optimized closure over the union of `byte.BYTE` and the input
      projection of `self._feature_mapper`.
    """
    labels = pynini.project(self._feature_mapper, "input")
    alphabet = pynini.union(byte.BYTE, labels)
    return alphabet.closure().optimize()
def get_one_to_one_thousand(cardinal: 'pynini.FstLike') -> 'pynini.FstLike':
    """
    Produces an acceptor for the verbalizations of the numbers 1 through 999
    (the digit strings come from `range(1, 1000)`, so 1000 itself is not included).
    Needed for ordinals and fractions.

    Args:
        cardinal: CardinalFst

    Returns:
        fst: A pynini.FstLike object
    """
    digit_strings = [str(value) for value in range(1, 1000)]
    verbalized = pynini.compose(pynini.string_map(digit_strings), cardinal)
    return pynini.project(verbalized, "output").optimize()
def get_quantity(decimal: "pynini.FstLike", cardinal_up_to_hundred: "pynini.FstLike") -> "pynini.FstLike":
    """
    Builds the FST that tags either a cardinal or a decimal followed by a quantity,
    e.g. một triệu -> integer_part: "1" quantity: "triệu"
    e.g. một tỷ rưỡi -> integer_part: "1" quantity: "tỷ" fractional_part: "5"

    Args:
        decimal: decimal FST
        cardinal_up_to_hundred: cardinal FST
    """
    # Drop leading zeros and require at least one non-zero leading digit.
    without_leading_zeros = (
        pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT)
    )
    numbers = cardinal_up_to_hundred @ without_leading_zeros
    suffix = pynini.union("triệu", "tỉ", "tỷ", "vạn")

    graph_four = pynini.cross("tư", "4")
    graph_one = pynini.cross("mốt", "1")
    graph_half = pynini.cross("rưỡi", "5")

    # "năm" is removed from the digit inputs accepted as the trailing digit.
    last_digit_exception = pynini.project(pynini.cross("năm", "5"), "input")
    plain_digit = (pynini.project(graph_digit, "input") - last_digit_exception.arcsort()) @ graph_digit
    last_digit = pynini.union(plain_digit, graph_one, graph_four, graph_half)

    fraction = (
        delete_extra_space
        + pynutil.insert('fractional_part: "')
        + (last_digit | graph_half | graph_one | graph_four)
        + pynutil.insert('"')
    )
    optional_fraction_graph = pynini.closure(fraction, 0, 1)

    cardinal_branch = (
        pynutil.insert('integer_part: "')
        + numbers
        + pynutil.insert('"')
        + delete_extra_space
        + pynutil.insert('quantity: "')
        + suffix
        + pynutil.insert('"')
        + optional_fraction_graph
    )
    decimal_branch = (
        decimal
        + delete_extra_space
        + pynutil.insert('quantity: "')
        + (suffix | "ngàn" | "nghìn")
        + pynutil.insert('"')
    )
    res = cardinal_branch
    res |= decimal_branch
    return res
def __init__(self):
    """
    Builds the telephone classifier: a spoken digit sequence in a 3-3-4 grouping
    (optionally using "double <digit>") is mapped to `number_part: "ddd-ddd-dddd"`.
    """
    super().__init__(name="telephone", kind="classify")
    # country code, number_part, extension
    digit_to_str = pynini.invert(
        pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    ).optimize() | pynini.cross("0", pynini.union("o", "oh", "zero"))

    def _spoken(digit):
        # All spoken forms that `digit_to_str` produces for this single digit.
        return pynini.project(pynini.compose(str(digit), digit_to_str), "output")

    double_rules = []
    for digit in range(10):
        repeated = _spoken(digit) + pynini.accep(" ") + _spoken(digit)
        doubled = pynutil.insert("double ") + _spoken(digit)
        double_rules.append(pynini.cross(repeated, doubled))
    double_digit = pynini.union(*double_rules)
    # In-place inversion of the union built above.
    double_digit.invert()

    spaced_digit = digit_to_str + insert_space
    dash = pynutil.delete("-")
    number_part = (
        pynini.closure(spaced_digit, 2, 2)
        + digit_to_str
        + dash
        + insert_space
        + pynini.closure(spaced_digit, 2, 2)
        + digit_to_str
        + dash
        + insert_space
        + pynini.closure(spaced_digit, 3, 3)
        + digit_to_str
    )
    # `@` binds tighter than `+`: the rewrite is composed with the inverted
    # number graph before the surrounding tag text is concatenated.
    expand_doubles = pynini.cdrewrite(double_digit, "", "", NEMO_SIGMA)
    number_part = (
        pynutil.insert("number_part: \"")
        + (expand_doubles @ pynini.invert(number_part))
        + pynutil.insert("\"")
    )

    graph = number_part
    final_graph = self.add_tokens(graph)
    self.fst = final_graph.optimize()
def __init__(self, itn_cardinal_tagger: GraphFst, itn_decimal_tagger: GraphFst, deterministic: bool = True):
    """
    Builds the money classifier from the cardinal and decimal taggers.

    Args:
        itn_cardinal_tagger: provides `graph_no_exception` for integer amounts
        itn_decimal_tagger: provides `final_graph_wo_negative` for decimal amounts
        deterministic: forwarded unchanged to the base class constructor
    """
    super().__init__(name="money", kind="classify", deterministic=deterministic)

    # A whole-string "ein"/"eine" is rewritten to "eins" before the cardinal graph is applied.
    one_to_eins = pynini.cdrewrite(
        pynini.cross(pynini.union("ein", "eine"), "eins"), "[BOS]", "[EOS]", NEMO_SIGMA
    )
    cardinal_graph = one_to_eins @ itn_cardinal_tagger.graph_no_exception
    graph_decimal_final = itn_decimal_tagger.final_graph_wo_negative

    graph_unit = pynini.invert(maj_singular)
    graph_unit = pynutil.insert("currency: \"") + convert_space(graph_unit) + pynutil.insert("\"")

    # Two digits pass through; a single digit gets a "0" inserted in front.
    add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT)
    two_digit_cents = cardinal_graph @ add_leading_zero_to_double_digit
    min_unit = pynini.project(min_singular | min_plural, "output")

    # elf euro (und) vier cent, vier cent
    cents_standalone = (
        pynutil.insert("fractional_part: \"")
        + two_digit_cents
        + delete_space
        + pynutil.delete(min_unit)
        + pynutil.insert("\"")
    )
    optional_und = pynini.closure(pynutil.delete("und") + delete_space, 0, 1)
    optional_cents_standalone = pynini.closure(
        delete_space + optional_und + insert_space + cents_standalone, 0, 1,
    )

    # elf euro vierzig, only after integer
    optional_cents_suffix = pynini.closure(
        delete_extra_space
        + pynutil.insert("fractional_part: \"")
        + pynutil.add_weight(two_digit_cents, -0.7)
        + pynutil.insert("\""),
        0,
        1,
    )

    graph_integer = (
        pynutil.insert("integer_part: \"")
        + cardinal_graph
        + pynutil.insert("\"")
        + delete_extra_space
        + graph_unit
        + (optional_cents_standalone | optional_cents_suffix)
    )

    graph_decimal = graph_decimal_final + delete_extra_space + graph_unit
    # Cents with no major-unit amount: tagged as zero units of "€".
    graph_decimal |= pynutil.insert("currency: \"€\" integer_part: \"0\" ") + cents_standalone

    final_graph = graph_integer | graph_decimal
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def __init__(self, cardinal: GraphFst):
    """
    Builds the date classifier. Accepted shapes are: day + "tháng" month with an
    optional "năm" year, "tháng" month + "năm" year, "tháng" month alone, and
    "năm" year alone. Every result ends with ` preserve_order: true`.

    Args:
        cardinal: provides `graph_no_exception`, used for the day value
    """
    super().__init__(name="date", kind="classify")

    cardinal_graph = cardinal.graph_no_exception
    YEAR_WEIGHT = 0.001
    year_graph = pynutil.add_weight(_get_year_graph(), YEAR_WEIGHT)
    month_graph = pynutil.insert('month: "') + _get_month_graph() + pynutil.insert('"')

    # For a bare "tháng <month>", the input "năm" is not accepted as the month.
    month_exception = pynini.project(pynini.cross("năm", "5"), "input")
    month_graph_exception = (pynini.project(month_graph, "input") - month_exception.arcsort()) @ month_graph

    day_graph = pynutil.insert('day: "') + cardinal_graph + pynutil.insert('"')
    # day_suffix = pynini.union("ngày", "mùng")
    # optional_day = pynini.closure(day_suffix + delete_space, 0, 1)

    delete_month_word = pynutil.delete("tháng")
    delete_year_word = pynutil.delete("năm")

    graph_month = delete_month_word + delete_space + month_graph_exception

    # Year that follows a month: the weight added to `year_graph` is cancelled here.
    trailing_year = (
        delete_extra_space
        + delete_year_word
        + delete_extra_space
        + pynutil.insert('year: "')
        + pynutil.add_weight(year_graph, -YEAR_WEIGHT)
        + pynutil.insert('"')
    )
    optional_trailing_year = pynini.closure(trailing_year, 0, 1)

    graph_my = delete_month_word + delete_space + month_graph + trailing_year
    graph_dmy = (
        day_graph
        + delete_space
        + delete_month_word
        + delete_extra_space
        + month_graph
        + optional_trailing_year
    )

    # Year on its own keeps the YEAR_WEIGHT penalty.
    standalone_year = (
        delete_year_word
        + delete_extra_space
        + pynutil.insert('year: "')
        + year_graph
        + pynutil.insert('"')
    )

    final_graph = (graph_dmy | graph_my | graph_month | standalone_year) + pynutil.insert(" preserve_order: true")
    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def Tester(stream, far_reader):
    """Runs rewrite rules over test lines and reports mismatches on stderr.

    Each line of `stream` holds three tab-separated fields: a comma-separated
    list of rule names, the input string, and the expected output. Lines with a
    different field count are skipped with a message and do not count as
    failures. A rule missing from `far_reader` marks the run as failed.

    Args:
      stream: iterable of test lines
      far_reader: mapping from rule name to rule FST (raises KeyError if absent)

    Returns:
      None
    """
    all_passed = True
    # Line numbers in messages are zero-based.
    for lineno, line in enumerate(stream):
        fields = line.strip('\n').split('\t')
        if len(fields) != 3:
            sys.stderr.write('Skipping line %d (wrong number of fields)\n' % lineno)
            continue
        rules, input_, output = fields

        composed = input_
        for rule in rules.split(','):
            try:
                rule_fst = far_reader[rule]
            except KeyError:
                sys.stderr.write('Warning: cannot find rule %s, line %d\n' %
                                 (rule, lineno))
                break
            composed = composed * rule_fst
        else:
            # Every rule was found and applied.
            best = pynini.shortestpath(pynini.project(composed, True))
            paths = pynini.StringPaths(best)
            # Accepts the first string, or '' when there is no path at all.
            pred = paths.istring() if not paths.done() else ''
            if pred != output:
                all_passed = False
                sys.stderr.write('Line %d: input and output do not match for\n'
                                 ' Rules:\t%s\n'
                                 ' Input:\t%s\nExpected:\t%s\n Actual:\t%s\n' %
                                 (lineno, rules, input_, output, pred))
            continue
        # Reached only via `break`: a rule lookup failed.
        all_passed = False

    if all_passed:
        sys.stderr.write('All tests pass!!\n')
    else:
        sys.stderr.write('Some rewrites failed\n')
def core_visual_norm_fsts(rewrite_file: os.PathLike,
                          preserve_file: os.PathLike,
                          consonant_file: os.PathLike,
                          sigma: pynini.Fst) -> List[pynini.Fst]:
    """Creates the stages of a visual normalization FST.

    The returned list holds, in application order: the rewrites from
    `rewrite_file`; a rule marking the sequences from `preserve_file` when they
    occur between consonants; a rule deleting ZWNJ, ZWJ and ZWS; a rule that
    inverts the marking; and `sigma.star`.

    Args:
      rewrite_file: Path relative to the runfiles directory of a StringFile of
        visual rewrites.
      preserve_file: Path relative to the runfiles directory of a StringFile of
        ZWJ sequences to preserve.
      consonant_file: Path relative to the runfiles directory of a StringFile
        containing a native--latin consonant mapping.
      sigma: An Fst with which to consider the complete alphabet for cdrewrites.

    Returns:
      List of FSTs to be composed into the visual normalization FST.
    """
    visual_rewrite = rule.fst_from_rule_file(rewrite_file, sigma)
    preserve_map = uf.StringFile(preserve_file)
    consonants = pynini.project(uf.StringFile(consonant_file), 'input')

    # Generated symbols (those delimited by square brackets, e.g.
    # `[ZWJ,VIRAMA]`) are implementation details of ZWJ preservation; they
    # have to be part of the alphabet the intermediate rules operate on.
    generated = u.BuildSigmaFstFromSymbolTable(pynini.generated_symbols())
    extended_sigma = generated.union(sigma)

    joiners = pynini.union(uc.ZWNJ, uc.ZWJ, uc.ZWS)
    stages = [
        visual_rewrite,
        ur.Rewrite(preserve_map, extended_sigma, consonants, consonants),
        ur.Rewrite(pynutil.delete(joiners), extended_sigma),
        ur.Rewrite(pynini.invert(preserve_map), extended_sigma),
        # Right-composing with sigma.star keeps the generated symbols from
        # leaking through into the final FST.
        sigma.star,
    ]
    return stages
def _assert_fst_sampled_behavior(
        self, fsts: List[pynini.Fst],
        token_type: pynini.TokenType,
        samples: int,
        assert_function: Callable[[pynini.Fst, pynini.Fst], None]) -> None:
    """Asserts that the FSTs follow a specific behavior on sampled inputs.

    Inputs are sampled from the input projection of the first FST, each sample
    is composed with all FSTs in order, and `assert_function` is called on the
    result. Sampling is used in lieu of statically verifying the property, as
    that isn't easy to answer for non-deterministic FSTs.

    If token_type is "byte", the input projection is first intersected with the
    closure over valid UTF-8 characters so that every sample is a valid UTF-8
    string that Python can handle. Sample length is capped at
    `_MAX_SAMPLE_LENGTH` labels.

    Args:
      fsts: List of FSTs to be applied on a sample.
      token_type: The token_type used to derive the FST.
      samples: The number of input samples to take.
      assert_function: Called with the input string FSA and the composed
        output FST, inside the `pynini.default_token_type` context. It raises
        AssertionError on failure.
    """
    sample_space = pynini.project(fsts[0], "input")
    if token_type == "byte":
        # NOTE: Sampling straight from the byte machine can produce sequences
        # that are not well-formed UTF-8 and so cannot become a Python str.
        sample_space = pynini.intersect(sample_space, utf8.VALID_UTF8_CHAR.star)

    sampled = pynini.randgen(
        sample_space, npath=samples, max_length=_MAX_SAMPLE_LENGTH)

    with pynini.default_token_type(token_type):
        for labels in _olabels_iter(sampled):
            sample_fsa = _label_list_to_string_fsa(labels)
            composed = rewrite.ComposeFsts([sample_fsa] + fsts)
            assert_function(sample_fsa, composed)
def generator_main(exporter_map: multi_grm.ExporterMapping):
    """Generates FSTs for visual normalization of Brahmic scripts."""
    for token_type in ('byte', 'utf8'):
        rewrite_map = {}
        with pynini.default_token_type(token_type):
            sigma_map = {}

            # Script-level normalizers: NFC, mark deduplication, then the core stages.
            for script in u.SCRIPTS:
                script_dir = u.SCRIPT_DIR / script
                sigma = u.OpenSigma(script, token_type)
                sigma_map[script] = sigma
                dedup = cu.dedup_marks_fst(script, sigma)
                nfc = open_nfc(script, token_type)
                core_stages = core_visual_norm_fsts(
                    script_dir / 'visual_rewrite.tsv',
                    script_dir / 'preserve.tsv',
                    script_dir / 'consonant.tsv',
                    sigma)
                rewrite_map[script] = ur.ComposeFsts([nfc, dedup] + core_stages)

            # Language-level normalizers: the script FST followed by rewrites
            # conditioned on a following / preceding consonant.
            for script, langs in u.LANG_SCRIPT_MAP.items():
                for lang in langs:
                    sigma = sigma_map[script]
                    consonant_map = uf.StringFile(
                        u.SCRIPT_DIR / script / 'consonant.tsv')
                    consonant = pynini.project(consonant_map, 'input')

                    lang_dir = u.SCRIPT_DIR / script / lang
                    before_cons = uf.StringFile(lang_dir / 'before_consonant.tsv')
                    rewrite_before_cons = ur.Rewrite(
                        before_cons, sigma, right=consonant)
                    after_cons = uf.StringFile(lang_dir / 'after_consonant.tsv')
                    rewrite_after_cons = ur.Rewrite(
                        after_cons, sigma, left=consonant)

                    rewrite_map[lang] = ur.ComposeFsts([
                        rewrite_map[script],
                        rewrite_before_cons,
                        rewrite_after_cons,
                    ])

        exporter = exporter_map[token_type]
        for name, fst in rewrite_map.items():
            exporter[name.upper()] = fst
def __init__(self, deterministic: bool = True):
    """
    Builds the electronic classifier for e-mail addresses, bare domains and URLs.

    Three shapes are accepted: `username@domain`, a domain ending in one of the
    entries of `data/electronic/domain.tsv`, and a protocol prefix followed by a
    domain.

    Args:
        deterministic: forwarded unchanged to the base class constructor
    """
    super().__init__(name="electronic", kind="classify", deterministic=deterministic)

    accepted_symbols = pynini.project(
        pynini.string_file(get_abs_path("data/electronic/symbol.tsv")), "input")
    accepted_common_domains = pynini.project(
        pynini.string_file(get_abs_path("data/electronic/domain.tsv")), "input")
    # One letter followed by any mix of letters, digits and accepted symbols.
    all_accepted_symbols = NEMO_ALPHA + pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols)
    graph_symbols = pynini.string_file(get_abs_path("data/electronic/symbol.tsv")).optimize()

    username = (
        pynutil.insert("username: \"")
        + all_accepted_symbols
        + pynutil.insert("\"")
        + pynini.cross('@', ' ')
    )
    domain_graph = all_accepted_symbols + pynini.accep('.') + all_accepted_symbols

    # ":" is a colon; the previous output string "semicolon" named the wrong character.
    protocol_symbols = pynini.closure(
        (graph_symbols | pynini.cross(":", "colon")) + pynutil.insert(" "))
    protocol_start = (pynini.cross("https", "HTTPS ") | pynini.cross("http", "HTTP ")) + (
        pynini.accep("://") @ protocol_symbols)
    protocol_file_start = pynini.accep("file") + insert_space + (
        pynini.accep(":///") @ protocol_symbols)
    # `@` binds tighter than `+`: only the "." is composed with the symbol graph.
    protocol_end = pynini.cross("www", "WWW ") + (pynini.accep(".") @ protocol_symbols)
    protocol = protocol_file_start | protocol_start | protocol_end | (protocol_start + protocol_end)

    # Anything that begins with a protocol prefix is not allowed to match as a plain domain.
    starts_with_protocol = pynini.project(protocol, "input") + NEMO_SIGMA
    domain_graph = (
        pynutil.insert("domain: \"")
        + pynini.difference(domain_graph, starts_with_protocol)
        + pynutil.insert("\"")
    )
    domain_common_graph = (
        pynutil.insert("domain: \"")
        + pynini.difference(
            all_accepted_symbols
            + accepted_common_domains
            + pynini.closure(
                accepted_symbols + pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols), 0, 1),
            pynini.project(protocol, "input") + NEMO_SIGMA,
        )
        + pynutil.insert("\"")
    )

    protocol = pynutil.insert("protocol: \"") + protocol + pynutil.insert("\"")

    # email
    graph = username + domain_graph
    # abc.com, abc.com/123-sm
    graph |= domain_common_graph
    # www.abc.com/sdafsdf, or https://www.abc.com/asdfad or www.abc.abc/asdfad
    graph |= protocol + pynutil.insert(" ") + domain_graph

    final_graph = self.add_tokens(graph)
    self.fst = final_graph.optimize()
GraphFst,
    delete_preserve_order,
)
from nemo_text_processing.text_normalization.es.graph_utils import shift_cardinal_gender, strip_cardinal_apocope
from nemo_text_processing.text_normalization.es.utils import get_abs_path

# Module-level currency tables. They are built only when pynini can be
# imported; otherwise every table is None and PYNINI_AVAILABLE is False.
try:
    import pynini
    from pynini.lib import pynutil

    # String-file transducers; the singular/plural acceptors below are their
    # input and output sides respectively.
    fem = pynini.string_file(
        (get_abs_path("data/money/currency_plural_fem.tsv")))
    masc = pynini.string_file(
        (get_abs_path("data/money/currency_plural_masc.tsv")))

    fem_singular = pynini.project(fem, "input")
    masc_singular = pynini.project(masc, "input")

    fem_plural = pynini.project(fem, "output")
    masc_plural = pynini.project(masc, "output")

    PYNINI_AVAILABLE = True

except (ModuleNotFoundError, ImportError):
    # pynini is missing: keep the names defined so importing this module succeeds.
    fem_plural = None
    masc_plural = None

    fem_singular = None
    masc_singular = None

    PYNINI_AVAILABLE = False
delete_extra_space,
    delete_preserve_order,
)
from nemo_text_processing.text_normalization.es.graph_utils import ones
from nemo_text_processing.text_normalization.es.utils import get_abs_path

# Module-level measurement-unit tables. They are built only when pynini can be
# imported; otherwise every table is None and PYNINI_AVAILABLE is False.
try:
    import pynini
    from pynini.lib import pynutil

    unit_plural_fem = pynini.string_file(
        get_abs_path("data/measures/measurements_plural_fem.tsv"))
    unit_plural_masc = pynini.string_file(
        get_abs_path("data/measures/measurements_plural_masc.tsv"))

    # The singular acceptors must be taken first: the plural names still hold
    # the full transducers here and are rebound to their output sides below.
    unit_singular_fem = pynini.project(unit_plural_fem, "input")
    unit_singular_masc = pynini.project(unit_plural_masc, "input")

    unit_plural_fem = pynini.project(unit_plural_fem, "output")
    unit_plural_masc = pynini.project(unit_plural_masc, "output")

    PYNINI_AVAILABLE = True

except (ModuleNotFoundError, ImportError):
    # pynini is missing: keep the names defined so importing this module succeeds.
    unit_plural_fem = None
    unit_plural_masc = None

    unit_singular_fem = None
    unit_singular_masc = None

    PYNINI_AVAILABLE = False
def __init__(self):
    """
    Builds the cardinal classifier that turns spelled-out numbers (using the
    scale words "mil", "millón/millones", "millardo(s)", "billón/billones",
    "trillón/trillones") into digit strings tagged as `integer: "..."`, with
    an optional leading "menos" tagged as `negative: "-"`.

    Attributes set here:
        graph_hundred_component_at_least_one_none_zero_digit: three-digit group
            containing at least one non-zero digit
        graph_no_exception: full number graph
        numbers_up_to_thousand: `graph_no_exception` limited to 1-3 digit outputs
        numbers_up_to_million: `graph_no_exception` limited to 1-6 digit outputs
        graph: `graph_no_exception` minus the inputs of the digit and zero files
        fst: the final tagged and optimized classifier
    """
    super().__init__(name="cardinal", kind="classify")
    # Word -> digit tables (presumably; the file contents are not visible here).
    graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    graph_digit = pynini.string_file(
        get_abs_path("data/numbers/digit.tsv"))
    graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
    graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
    graph_twenties = pynini.string_file(
        get_abs_path("data/numbers/twenties.tsv"))
    graph_hundreds = pynini.string_file(
        get_abs_path("data/numbers/hundreds.tsv"))

    # One group of hundreds + tens/units; absent parts are filled by inserting "0"/"00".
    graph_hundred_component = graph_hundreds | pynutil.insert("0")
    graph_hundred_component += delete_space
    graph_hundred_component += pynini.union(
        graph_twenties | graph_teen | pynutil.insert("00"),
        (graph_ties | pynutil.insert("0")) + delete_space + (graph_digit | pynutil.insert("0")),
    )

    # Same group, restricted to outputs that contain at least one non-zero digit.
    graph_hundred_component_at_least_one_none_zero_digit = graph_hundred_component @ (
        pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT))
    self.graph_hundred_component_at_least_one_none_zero_digit = (
        graph_hundred_component_at_least_one_none_zero_digit)

    # Thousands group; when absent, "000" is inserted at a small weight penalty.
    graph_thousands = pynini.union(
        graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("mil"),
        pynutil.insert("001") + pynutil.delete("mil"),  # because we say 'mil', not 'un mil'
        pynutil.insert("000", weight=0.1),
    )

    graph_millones = pynini.union(
        graph_hundred_component_at_least_one_none_zero_digit
        + delete_space
        + (pynutil.delete("millones") | pynutil.delete("millón")),
        pynutil.insert("000") + pynutil.delete("millones"),  # to allow for 'mil millones'
    )

    graph_mil_millones = pynini.union(
        graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("mil"),
        pynutil.insert("001") + pynutil.delete("mil"),  # because we say 'mil', not 'un mil'
    )
    graph_mil_millones += delete_space + (
        graph_millones | pynutil.insert("000") + pynutil.delete("millones")
    )  # allow for 'mil millones'
    graph_mil_millones |= pynutil.insert("000000", weight=0.1)

    # also allow 'millardo' instead of 'mil millones'
    graph_millardo = (
        graph_hundred_component_at_least_one_none_zero_digit
        + delete_space
        + (pynutil.delete("millardo") | pynutil.delete("millardos")))

    graph_billones = pynini.union(
        graph_hundred_component_at_least_one_none_zero_digit
        + delete_space
        + (pynutil.delete("billones") | pynutil.delete("billón")),
    )

    graph_mil_billones = pynini.union(
        graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("mil"),
        pynutil.insert("001") + pynutil.delete("mil"),  # because we say 'mil', not 'un mil'
    )
    graph_mil_billones += delete_space + (
        graph_billones | pynutil.insert("000") + pynutil.delete("billones")
    )  # allow for 'mil billones'
    graph_mil_billones |= pynutil.insert("000000", weight=0.1)

    graph_trillones = pynini.union(
        graph_hundred_component_at_least_one_none_zero_digit
        + delete_space
        + (pynutil.delete("trillones") | pynutil.delete("trillón")),
    )

    graph_mil_trillones = pynini.union(
        graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("mil"),
        pynutil.insert("001") + pynutil.delete("mil"),  # because we say 'mil', not 'un mil'
    )
    graph_mil_trillones += delete_space + (
        graph_trillones | pynutil.insert("000") + pynutil.delete("trillones"))  # allow for 'mil trillones'
    graph_mil_trillones |= pynutil.insert("000000", weight=0.1)

    # Full number: the scale groups from largest to smallest, or the zero table alone.
    graph = pynini.union(
        (graph_mil_trillones | pynutil.insert("000", weight=0.1) + graph_trillones)
        + delete_space
        + (graph_mil_billones | pynutil.insert("000", weight=0.1) + graph_billones)
        + delete_space
        + pynini.union(
            graph_mil_millones,
            pynutil.insert("000", weight=0.1) + graph_millones,
            graph_millardo + graph_millones,
            graph_millardo + pynutil.insert("000", weight=0.1),
        )
        + delete_space
        + graph_thousands
        + delete_space
        + graph_hundred_component,
        graph_zero,
    )

    # Strip leading zeros from the digit string; a lone "0" is kept as is.
    graph = graph @ pynini.union(
        pynutil.delete(pynini.closure("0")) + pynini.difference(
            NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0")

    # ignore "y" inside cardinal numbers
    graph = (
        pynini.cdrewrite(pynutil.delete("y"), NEMO_SPACE, NEMO_SPACE, NEMO_SIGMA)
        @ (NEMO_ALPHA + NEMO_SIGMA)
        @ graph)

    self.graph_no_exception = graph

    # save self.numbers_up_to_thousand for use in DecimalFst
    digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3)
    numbers_up_to_thousand = pynini.compose(
        graph, digits_up_to_thousand).optimize()
    self.numbers_up_to_thousand = numbers_up_to_thousand

    # save self.numbers_up_to_million for use in DecimalFst
    digits_up_to_million = (NEMO_DIGIT
                            | (NEMO_DIGIT**2)
                            | (NEMO_DIGIT**3)
                            | (NEMO_DIGIT**4)
                            | (NEMO_DIGIT**5)
                            | (NEMO_DIGIT**6))
    numbers_up_to_million = pynini.compose(
        graph, digits_up_to_million).optimize()
    self.numbers_up_to_million = numbers_up_to_million

    # don't convert cardinals from zero to nine inclusive
    graph_exception = pynini.project(pynini.union(graph_digit, graph_zero), 'input')

    self.graph = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph

    # Optional "menos" + one space, tagged as a negative sign.
    optional_minus_graph = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("menos", "\"-\"") + NEMO_SPACE, 0, 1)

    final_graph = optional_minus_graph + pynutil.insert(
        "integer: \"") + self.graph + pynutil.insert("\"")

    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def __init__(self):
    """
    Builds the cardinal classifier that turns spelled-out numbers (using the
    scale words "trăm", "nghìn/ngàn", "vạn", "triệu", "tỉ/tỷ") into digit
    strings tagged as `integer: "..."`, with an optional leading "âm" or "trừ"
    tagged as `negative: "-"`.

    Attributes set here:
        graph_hundred_component_at_least_one_none_zero_digit: hundreds group
            containing at least one non-zero digit
        graph_no_exception: full number graph
        graph: `graph_no_exception` minus the inputs of the digit and zero files
        fst: the final tagged and optimized classifier
    """
    super().__init__(name="cardinal", kind="classify")
    # Word -> digit tables (presumably; the file contents are not visible here).
    graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    graph_digit = pynini.string_file(
        get_abs_path("data/numbers/digit.tsv"))
    graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
    graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))

    # Special word forms mapped directly to a digit.
    graph_one = pynini.cross("mốt", "1")
    graph_four = pynini.cross("tư", "4")
    graph_five = pynini.cross("lăm", "5")
    graph_half = pynini.cross("rưỡi", "5")
    # "trăm" and "mươi" are consumed without producing output.
    graph_hundred = pynini.cross("trăm", "")
    graph_ten = pynini.cross("mươi", "")
    # "linh" / "lẻ" stand for a zero in the tens position.
    zero = pynini.cross(pynini.union("linh", "lẻ"), "0")

    optional_ten = pynini.closure(delete_space + graph_ten, 0, 1)
    last_digit = graph_digit | graph_one | graph_four | graph_five

    # Form with an explicit hundreds digit followed by "trăm".
    graph_hundred_component = (graph_digit | graph_zero) + delete_space + graph_hundred
    graph_hundred_component += delete_space
    graph_hundred_component += pynini.union(
        graph_teen,
        graph_ties + optional_ten + ((delete_space + last_digit) | pynutil.insert("0")),
        (graph_half | graph_four | graph_one) + pynutil.insert("0"),
        zero + delete_space + (graph_digit | graph_four),
        graph_digit,
        pynutil.insert("00"),
    )
    # Form without a hundreds word: a "0" is inserted for the hundreds digit.
    graph_hundred_component |= (
        pynutil.insert("0")
        + delete_space
        + pynini.union(
            graph_teen,
            graph_ties + optional_ten + ((delete_space + last_digit) | pynutil.insert("0")),
            zero + delete_space + (graph_digit | graph_four),
            graph_digit,
        ))

    # Same group, restricted to outputs that contain at least one non-zero digit.
    graph_hundred_component_at_least_one_none_zero_digit = graph_hundred_component @ (
        pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT))
    self.graph_hundred_component_at_least_one_none_zero_digit = (
        graph_hundred_component_at_least_one_none_zero_digit)

    # Each scale group is either spoken or replaced by inserted zeros at weight 0.1.
    graph_thousands = pynini.union(
        graph_hundred_component_at_least_one_none_zero_digit
        + delete_space
        + pynutil.delete(pynini.union("nghìn", "ngàn")),
        pynutil.insert("000", weight=0.1),
    )

    graph_ten_thousand = pynini.union(
        graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("vạn"),
        pynutil.insert("0000", weight=0.1),
    )

    # Single thousands digit that may follow a "vạn" group.
    graph_ten_thousand_suffix = pynini.union(
        graph_digit + delete_space + pynutil.delete(pynini.union("nghìn", "ngàn")),
        pynutil.insert("0", weight=0.1),
    )

    graph_million = pynini.union(
        graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("triệu"),
        pynutil.insert("000", weight=0.1),
    )
    graph_billion = pynini.union(
        graph_hundred_component_at_least_one_none_zero_digit
        + delete_space
        + pynutil.delete(pynini.union("tỉ", "tỷ")),
        pynutil.insert("000", weight=0.1),
    )

    # Full number: billions-based form, "vạn"-based form, a thousands form that
    # allows a single trailing digit padded with "00", or the zero table alone.
    graph = pynini.union(
        graph_billion
        + delete_space
        + graph_million
        + delete_space
        + graph_thousands
        + delete_space
        + graph_hundred_component,
        graph_ten_thousand + delete_space + graph_ten_thousand_suffix + delete_space + graph_hundred_component,
        graph_hundred_component_at_least_one_none_zero_digit
        + delete_space
        + pynutil.delete(pynini.union("nghìn", "ngàn"))
        + delete_space
        + ((last_digit + pynutil.insert("00")) | graph_hundred_component),
        graph_zero,
    )

    # Strip leading zeros from the digit string; a lone "0" is kept as is.
    graph = graph @ pynini.union(
        pynutil.delete(pynini.closure("0")) + pynini.difference(
            NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0")

    # don't convert cardinals from zero to nine inclusive
    graph_exception = pynini.project(pynini.union(graph_digit, graph_zero), 'input')

    self.graph_no_exception = graph

    self.graph = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph

    # Optional "âm" / "trừ" + one space, tagged as a negative sign.
    optional_minus_graph = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross(pynini.union("âm", "trừ"), "\"-\"") + NEMO_SPACE, 0, 1)

    final_graph = optional_minus_graph + pynutil.insert(
        "integer: \"") + self.graph + pynutil.insert("\"")

    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def _input_string_file(filename: os.PathLike,
                       return_if_empty: pynini.Fst = uf.EMPTY) -> pynini.Fst:
    """Load a string file and keep only its input side.

    Args:
        filename: path handed to ``uf.StringFile``.
        return_if_empty: fallback FST handed to ``uf.StringFile``.

    Returns:
        The input projection of the loaded FST with epsilons removed.
    """
    loaded = uf.StringFile(filename, return_if_empty)
    input_side = pynini.project(loaded, 'input')
    return input_side.rmepsilon()
def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = True, lm: bool = False):
    """
    Finite state transducer for classifying serial (handles only cases without delimiters,
    values with delimiters are handled by default).
    The serial is a combination of digits, letters and dashes, e.g.:
    c325b -> tokens { cardinal { integer: "c three two five b" } }

    The FST stored in ``self.fst`` wraps the verbalized serial in ``name: "..."``;
    the unwrapped graph is kept in ``self.graph``.

    Args:
        cardinal: cardinal tagger; its ``single_digits_graph``, ``graph`` and
            ``graph_hundred_component_at_least_one_none_zero_digit`` are used.
        ordinal: ordinal tagger; inputs accepted by ``ordinal.graph`` are
            excluded from the serial options.
        deterministic: forwarded to the base class; when False (and ``lm`` is
            False) extra digit readings are added.
        lm: when True the extra non-deterministic digit readings are skipped.
    """
    super().__init__(name="integer", kind="classify", deterministic=deterministic)

    # Resolved once; used both as an FST lexicon and as a plain label list.
    symbol_path = get_abs_path("data/whitelist/symbol.tsv")

    # 6+ digits are read digit by digit, 1-5 digits as a cardinal.
    num_graph = pynini.compose(NEMO_DIGIT**(6, ...), cardinal.single_digits_graph).optimize()
    num_graph |= pynini.compose(NEMO_DIGIT**(1, 5), cardinal.graph).optimize()
    # to handle numbers starting with zero
    num_graph |= pynini.compose(
        pynini.accep("0") + pynini.closure(NEMO_DIGIT), cardinal.single_digits_graph).optimize()
    # TODO: "#" doesn't work from the file
    symbols_graph = pynini.string_file(symbol_path).optimize() | pynini.cross("#", "hash")
    num_graph |= symbols_graph

    if not self.deterministic and not lm:
        num_graph |= cardinal.single_digits_graph
        # also allow double digits to be pronounced as integer in serial number
        num_graph |= pynutil.add_weight(
            NEMO_DIGIT**2 @ cardinal.graph_hundred_component_at_least_one_none_zero_digit, weight=0.0001)

    # add space between letter and digit/symbol
    symbols = [x[0] for x in load_labels(symbol_path)]
    symbols = pynini.union(*symbols)
    digit_symbol = NEMO_DIGIT | symbols

    graph_with_space = pynini.compose(
        pynini.cdrewrite(pynutil.insert(" "), NEMO_ALPHA | symbols, digit_symbol, NEMO_SIGMA),
        pynini.cdrewrite(pynutil.insert(" "), digit_symbol, NEMO_ALPHA | symbols, NEMO_SIGMA),
    )

    # serial graph with delimiter
    delimiter = pynini.accep("-") | pynini.accep("/") | pynini.accep(" ")
    alphas = pynini.closure(NEMO_ALPHA, 1)
    letter_num = alphas + delimiter + num_graph
    num_letter = pynini.closure(num_graph + delimiter, 1) + alphas
    next_alpha_or_num = pynini.closure(delimiter + (alphas | num_graph))
    next_alpha_or_num |= pynini.closure(
        delimiter
        + num_graph
        + plurals._priority_union(pynini.accep(" "), pynutil.insert(" "), NEMO_SIGMA).optimize()
        + alphas)

    serial_graph = letter_num + next_alpha_or_num
    serial_graph |= num_letter + next_alpha_or_num
    # numbers only with 2+ delimiters
    serial_graph |= (num_graph + delimiter + num_graph + delimiter + num_graph +
                     pynini.closure(delimiter + num_graph))
    # 2+ symbols
    serial_graph |= pynini.compose(NEMO_SIGMA + symbols + NEMO_SIGMA, num_graph + delimiter + num_graph)

    # exclude ordinal numbers from serial options
    serial_graph = pynini.compose(
        pynini.difference(NEMO_SIGMA, pynini.project(ordinal.graph, "input")),
        serial_graph).optimize()

    serial_graph = pynutil.add_weight(serial_graph, 0.0001)
    serial_graph |= (pynini.closure(NEMO_NOT_SPACE, 1) +
                     (pynini.cross("^2", " squared") | pynini.cross("^3", " cubed")).optimize())

    # at least one serial graph with alpha numeric value and optional additional serial/num/alpha values
    serial_graph = (
        pynini.closure((serial_graph | num_graph | alphas) + delimiter)
        + serial_graph
        + pynini.closure(delimiter + (serial_graph | num_graph | alphas)))

    serial_graph |= pynini.compose(graph_with_space, serial_graph.optimize()).optimize()
    # Only inputs made of at least two non-space characters are accepted.
    serial_graph = pynini.compose(pynini.closure(NEMO_NOT_SPACE, 2), serial_graph).optimize()

    self.graph = serial_graph.optimize()
    graph = pynutil.insert("name: \"") + convert_space(self.graph).optimize() + pynutil.insert("\"")
    self.fst = graph.optimize()
def _priority_union(q: pynini.Fst, r: pynini.Fst, sigma: pynini.Fst) -> pynini.Fst:
    """Union ``q`` with ``r``, letting ``r`` apply only where ``q`` does not.

    Args:
        q: the transducer that takes priority.
        r: the fallback transducer.
        sigma: the language from which ``q``'s input domain is subtracted.

    Returns:
        ``q`` united with ``r`` restricted to inputs in ``sigma`` that are
        outside the input projection of ``q``.
    """
    q_domain = pynini.project(q, "input")
    outside_q = sigma - q_domain
    fallback = outside_q @ r
    return pynini.union(q, fallback)
def __init__(self, cardinal: GraphFst):
    """Build the classifier FST for spoken digit sequences.

    Spoken digits (including "o"/"oh"/"zero" for 0, "double <digit>" and
    two-digit cardinals) are mapped back to digit strings and matched against
    several fixed layouts, each emitted as ``number_part: "..."``:
    3-3-4 digits with hyphens (optionally preceded by ``country_code: "..."``),
    four space-separated groups of four digits, 3-2-4 digits with hyphens,
    four components joined by " dot ", and whatever ``get_serial_number``
    returns.

    Args:
        cardinal: cardinal tagger; ``graph_no_exception`` is read here and the
            object is forwarded to ``get_serial_number``.
    """
    super().__init__(name="telephone", kind="classify")

    def _as_number_part(body):
        # Wrap an FST's output in the `number_part: "..."` field.
        return pynutil.insert("number_part: \"") + body + pynutil.insert("\"")

    def _hyphenated(first_len, *other_lens):
        # Digit groups of the given sizes with "-" inserted between them.
        pattern = NEMO_DIGIT ** first_len
        for group_len in other_lens:
            pattern = pattern + pynutil.insert("-") + NEMO_DIGIT ** group_len
        return pattern

    drop_space = pynutil.delete(" ")

    # country code, number_part, extension
    digit_words = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize()
    zero_words = pynini.cross("0", pynini.union("o", "oh", "zero")).optimize()
    digit_to_str = pynini.invert(digit_words) | zero_words
    str_to_digit = pynini.invert(digit_to_str)

    # "<w> <w>" <-> "double <w>" for every digit word <w>.
    doubled_pairs = []
    for digit in map(str, range(10)):
        spoken = pynini.project(digit @ digit_to_str, "output")
        doubled_pairs.append(
            pynini.cross(
                spoken + pynini.accep(" ") + spoken,
                pynutil.insert("double ") + spoken,
            )
        )
    double_digit = pynini.union(*doubled_pairs)
    double_digit.invert()

    # to handle cases like "one twenty three"
    two_digit_cardinal = pynini.compose(cardinal.graph_no_exception, NEMO_DIGIT ** 2)
    spelled_pair = pynini.compose(double_digit, str_to_digit + drop_space + str_to_digit)
    double_digit_to_digit = spelled_pair | two_digit_cardinal

    single_or_double_digit = (pynutil.add_weight(double_digit_to_digit, -0.0001) | str_to_digit).optimize()
    more_chunks = pynini.closure(pynutil.add_weight(drop_space + single_or_double_digit, 0.0001))
    single_or_double_digit |= (single_or_double_digit + more_chunks).optimize()

    phone_digits = pynini.compose(single_or_double_digit, _hyphenated(3, 3, 4)).optimize()
    number_part = _as_number_part(phone_digits.optimize())

    cardinal_option = pynini.compose(single_or_double_digit, NEMO_DIGIT ** (2, 3))

    up_to_three_digits = pynini.closure(str_to_digit + drop_space, 0, 2) + str_to_digit
    country_code = (
        pynutil.insert("country_code: \"")
        + pynini.closure(pynini.cross("plus ", "+"), 0, 1)
        + (up_to_three_digits | cardinal_option)
        + pynutil.insert("\"")
    )
    optional_country_code = pynini.closure(country_code + drop_space + insert_space, 0, 1).optimize()

    graph = optional_country_code + number_part

    # credit card number
    space_four_digits = insert_space + NEMO_DIGIT ** 4
    credit_card_graph = pynini.compose(
        single_or_double_digit, NEMO_DIGIT ** 4 + space_four_digits ** 3
    ).optimize()
    graph |= _as_number_part(credit_card_graph.optimize())

    # SSN
    ssn_graph = pynini.compose(single_or_double_digit, _hyphenated(3, 2, 4)).optimize()
    graph |= _as_number_part(ssn_graph.optimize())

    # ip
    ip_component = pynini.closure(str_to_digit + drop_space, 0, 1) + double_digit_to_digit
    ip_component |= double_digit_to_digit + pynini.closure(drop_space + str_to_digit, 0, 1)
    ip_component |= str_to_digit + (drop_space + str_to_digit) ** (0, 2)
    ip_component |= cardinal_option
    ip_component = ip_component.optimize()
    ip_graph = ip_component + (pynini.cross(" dot ", ".") + ip_component) ** 3
    graph |= _as_number_part(ip_graph.optimize())

    graph |= _as_number_part(
        pynutil.add_weight(get_serial_number(cardinal=cardinal), weight=0.0001)
    )

    tagged = self.add_tokens(graph)
    self.fst = tagged.optimize()
def __init__(self):
    """Build the classifier FST that turns spoken cardinals into digit strings.

    Number words come from the ``data/numbers`` TSV files. Scale words
    ("cent(s)", "mille", "million(s)", "milliard(s)", "billion(s)",
    "billiard(s)", "trillion(s)", "trilliard(s)") are deleted and absent
    positions are padded with inserted zeros; leading zeros are stripped and
    the result is passed through ``rewrite``.

    Attributes set:
        graph_hundreds_component_at_least_one_none_zero_digit: three-digit
            component restricted to outputs with a non-zero digit.
        graph_no_exception: the full cardinal graph.
        numbers_up_to_thousand: the graph restricted to 1-3 digit outputs.
        numbers_up_to_million: the graph restricted to 1-6 digit outputs.
        graph: ``graph_no_exception`` minus the inputs listed in the digit and
            zero TSV files.
        fst: the final tagger, emitting ``integer: "..."`` with an optional
            ``negative: "-"`` prefix for "moins", wrapped by ``add_tokens``.
    """
    super().__init__(name="cardinal", kind="classify")
    graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    graph_teens = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
    graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
    graph_ties_unique = pynini.string_file(get_abs_path("data/numbers/ties_unique.tsv"))

    # Tens components
    graph_tens_component = graph_ties + ((delete_hyphen + graph_digit) | pynutil.insert("0"))
    graph_tens_component = pynini.union(graph_tens_component, graph_teens, graph_ties_unique)

    graph_tens_component_with_leading_zeros = pynini.union(
        graph_tens_component,
        (pynutil.insert("0") + (graph_digit | pynutil.insert("0", weight=0.01))),
    )

    # Hundreds components
    graph_cent_singular = pynutil.delete("cent")  # Used in hundreds place
    # Only used as terminus of hundred sequence. deux cents -> 200, deux cent un -> 201
    graph_cent_plural = pynini.cross("cents", "00")

    # Digit words other than "un"/"une".
    graph_digit_no_one = pynini.project(pynini.union("un", "une"), 'input')
    graph_digit_no_one = (pynini.project(graph_digit, "input") - graph_digit_no_one.arcsort()) @ graph_digit

    # Regular way: [1-9] * 100
    graph_hundreds_component_singular = graph_digit_no_one + delete_hyphen + graph_cent_singular
    # A bare "cent" stands for a hundreds digit of 1.
    graph_hundreds_component_singular = pynini.union(
        graph_hundreds_component_singular, pynini.cross("cent", "1"))
    graph_hundreds_component_singular += delete_hyphen
    graph_hundreds_component_singular += graph_tens_component_with_leading_zeros

    graph_hundreds_component_plural = graph_digit_no_one + delete_hyphen + graph_cent_plural

    graph_hundreds_component = pynini.union(
        graph_hundreds_component_singular,
        graph_hundreds_component_plural,
        pynutil.insert("0") + graph_tens_component_with_leading_zeros,
    )

    graph_hundreds_component_at_least_one_none_zero_digit = graph_hundreds_component @ (
        pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT))
    self.graph_hundreds_component_at_least_one_none_zero_digit = rewrite(
        graph_hundreds_component_at_least_one_none_zero_digit).optimize()

    # Graph thousands (we'll need this for cases of mille millions, mille milliards...)
    # Tens of hundreds. e.g. 1900 = nineteen hundred / "dix neuf cents"
    graph_tens_of_hundreds_component_singular = graph_tens_component + delete_hyphen + graph_cent_singular
    graph_tens_of_hundreds_component_singular += delete_hyphen + graph_tens_component_with_leading_zeros
    graph_tens_of_hundreds_component_plural = graph_tens_component + delete_hyphen + graph_cent_plural
    graph_tens_of_hundred_component = (
        graph_tens_of_hundreds_component_plural | graph_tens_of_hundreds_component_singular)

    graph_thousands = pynini.union(
        graph_hundreds_component_at_least_one_none_zero_digit + delete_hyphen + pynutil.delete("mille"),
        pynutil.insert("001") + pynutil.delete("mille"),  # because 'mille', not 'un mille'
        pynutil.insert("000", weight=0.1),
    )

    # All other large amounts
    graph_millions = pynini.union(
        graph_hundreds_component_at_least_one_none_zero_digit
        + delete_hyphen
        + (pynutil.delete("million") | pynutil.delete("millions")),
        pynutil.insert("000", weight=0.1),
    )

    graph_milliards = pynini.union(  # French for English 'billion'
        graph_hundreds_component_at_least_one_none_zero_digit
        + delete_hyphen
        + (pynutil.delete("milliard") | pynutil.delete("milliards")),
        pynutil.insert("000", weight=0.1),
    )

    graph_billions = pynini.union(  # NOTE: this is English 'trillion.'
        graph_hundreds_component_at_least_one_none_zero_digit
        + delete_hyphen
        + (pynutil.delete("billions") | pynutil.delete("billion")),
        pynutil.insert("000", weight=0.1),
    )

    graph_billiards = pynini.union(
        graph_hundreds_component_at_least_one_none_zero_digit
        + delete_hyphen
        + (pynutil.delete("billiards") | pynutil.delete("billiard")),
        pynutil.insert("000", weight=0.1),
    )

    graph_trillions = pynini.union(  # One thousand English trillions.
        graph_hundreds_component_at_least_one_none_zero_digit
        + delete_hyphen
        + (pynutil.delete("trillions") | pynutil.delete("trillion")),
        pynutil.insert("000", weight=0.1),
    )

    graph_trilliards = pynini.union(
        graph_hundreds_component_at_least_one_none_zero_digit
        + delete_hyphen
        + (pynutil.delete("trilliards") | pynutil.delete("trilliard")),
        pynutil.insert("000", weight=0.1),
    )

    graph = pynini.union(
        graph_trilliards
        + delete_hyphen
        + graph_trillions
        + delete_hyphen
        + graph_billiards
        + delete_hyphen
        + graph_billions
        + delete_hyphen
        + graph_milliards
        + delete_hyphen
        + graph_millions
        + delete_hyphen
        + graph_thousands
        + delete_hyphen
        + graph_hundreds_component,
        graph_tens_of_hundred_component,
        graph_zero,
    )

    # Drop leading zeros, but keep a lone "0".
    graph = graph @ pynini.union(
        pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT),
        "0",
    )

    graph = rewrite(graph)

    self.graph_no_exception = graph.optimize()

    # save self.numbers_up_to_thousand for use in DecimalFst
    digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT**2) | (NEMO_DIGIT**3)
    numbers_up_to_thousand = pynini.compose(graph, digits_up_to_thousand).optimize()
    self.numbers_up_to_thousand = numbers_up_to_thousand

    # save self.numbers_up_to_million for use in DecimalFst
    digits_up_to_million = (
        NEMO_DIGIT
        | (NEMO_DIGIT**2)
        | (NEMO_DIGIT**3)
        | (NEMO_DIGIT**4)
        | (NEMO_DIGIT**5)
        | (NEMO_DIGIT**6)
    )
    numbers_up_to_million = pynini.compose(graph, digits_up_to_million).optimize()
    self.numbers_up_to_million = numbers_up_to_million

    # don't convert cardinals from zero to nine inclusive
    graph_exception = pynini.project(pynini.union(graph_digit, graph_zero), 'input')

    self.graph = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph

    optional_minus_graph = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("moins", "\"-\"") + NEMO_SPACE, 0, 1)

    final_graph = optional_minus_graph + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")

    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def __init__(self, cardinal: GraphFst, deterministic: bool, lm: bool = False):
    """Build the classifier FST for written dates.

    Accepts month names/abbreviations (any of lower, capitalized or upper
    case), numeric months, days with optional "st/nd/rd/th" endings and years,
    in month-day-year, day-month-year and year-month-day layouts, and emits
    ``month: "..."``, ``day: "..."`` and ``year: "..."`` fields.

    Args:
        cardinal: cardinal tagger; its
            ``graph_hundred_component_at_least_one_none_zero_digit`` and
            ``single_digits_graph`` are used to verbalize days and years.
        deterministic: forwarded to the base class and ``_get_year_graph``.
            When False, ``preserve_order: true`` becomes optional and
            re-ordered variants (MDY<->DMY, YMD->MDY/DMY, month/day only)
            are added.
        lm: when True the same extra variants are added regardless of
            ``deterministic``.
    """
    super().__init__(name="date", kind="classify", deterministic=deterministic)

    # january
    month_graph = pynini.string_file(get_abs_path("data/date/month_name.tsv")).optimize()
    # January, JANUARY
    month_graph |= pynini.compose(TO_LOWER + pynini.closure(NEMO_CHAR), month_graph) | pynini.compose(
        TO_LOWER ** (2, ...), month_graph
    )

    # jan
    month_abbr_graph = pynini.string_file(get_abs_path("data/date/month_abbr.tsv")).optimize()
    # jan, Jan, JAN
    month_abbr_graph = (
        month_abbr_graph
        | pynini.compose(TO_LOWER + pynini.closure(NEMO_LOWER, 1), month_abbr_graph).optimize()
        | pynini.compose(TO_LOWER ** (2, ...), month_abbr_graph).optimize()
    ) + pynini.closure(pynutil.delete("."), 0, 1)
    month_graph |= month_abbr_graph.optimize()

    month_numbers_labels = pynini.string_file(get_abs_path("data/date/month_number.tsv")).optimize()
    cardinal_graph = cardinal.graph_hundred_component_at_least_one_none_zero_digit

    year_graph = _get_year_graph(cardinal_graph=cardinal_graph, deterministic=deterministic)

    # three_digit_year = (NEMO_DIGIT @ cardinal_graph) + insert_space + (NEMO_DIGIT ** 2) @ cardinal_graph
    # year_graph |= three_digit_year

    month_graph = pynutil.insert("month: \"") + month_graph + pynutil.insert("\"")
    month_numbers_graph = pynutil.insert("month: \"") + month_numbers_labels + pynutil.insert("\"")

    endings = ["rd", "th", "st", "nd"]
    endings += [x.upper() for x in endings]
    endings = pynini.union(*endings)

    # Day numbers 0-31 (as digits), optional leading "the ", optional ordinal ending.
    day_graph = (
        pynutil.insert("day: \"")
        + pynini.closure(pynutil.delete("the "), 0, 1)
        + (
            ((pynini.union("1", "2") + NEMO_DIGIT) | NEMO_DIGIT | (pynini.accep("3") + pynini.union("0", "1")))
            + pynini.closure(pynutil.delete(endings), 0, 1)
        )
        @ cardinal_graph
        + pynutil.insert("\"")
    )

    two_digit_year = _get_two_digit_year(
        cardinal_graph=cardinal_graph, single_digits_graph=cardinal.single_digits_graph
    )
    two_digit_year = pynutil.insert("year: \"") + two_digit_year + pynutil.insert("\"")

    # if lm:
    #     two_digit_year = pynini.compose(pynini.difference(NEMO_DIGIT, "0") + NEMO_DIGIT ** (3), two_digit_year)
    #     year_graph = pynini.compose(pynini.difference(NEMO_DIGIT, "0") + NEMO_DIGIT ** (2), year_graph)
    #     year_graph |= pynini.compose(pynini.difference(NEMO_DIGIT, "0") + NEMO_DIGIT ** (4, ...), year_graph)

    # Year following a day/month: separated by " " or by "," with optional space.
    graph_year = pynutil.insert(" year: \"") + pynutil.delete(" ") + year_graph + pynutil.insert("\"")
    graph_year |= (
        pynutil.insert(" year: \"")
        + pynini.accep(",")
        + pynini.closure(pynini.accep(" "), 0, 1)
        + year_graph
        + pynutil.insert("\"")
    )
    optional_graph_year = pynini.closure(graph_year, 0, 1)

    year_graph = pynutil.insert("year: \"") + year_graph + pynutil.insert("\"")

    graph_mdy = month_graph + (
        (delete_extra_space + day_graph)
        | (pynini.accep(" ") + day_graph)
        | graph_year
        | (delete_extra_space + day_graph + graph_year)
    )

    graph_mdy |= (
        month_graph
        + pynini.cross("-", " ")
        + day_graph
        + pynini.closure(((pynini.cross("-", " ") + NEMO_SIGMA) @ graph_year), 0, 1)
    )

    for x in ["-", "/", "."]:
        delete_sep = pynutil.delete(x)
        graph_mdy |= (
            month_numbers_graph
            + delete_sep
            + insert_space
            + pynini.closure(pynutil.delete("0"), 0, 1)
            + day_graph
            + delete_sep
            + insert_space
            + (year_graph | two_digit_year)
        )

    graph_dmy = day_graph + delete_extra_space + month_graph + optional_graph_year
    # Two-digit strings that are not month numbers, read as a day.
    day_ex_month = (NEMO_DIGIT ** 2 - pynini.project(month_numbers_graph, "input")) @ day_graph
    for x in ["-", "/", "."]:
        delete_sep = pynutil.delete(x)
        graph_dmy |= (
            day_ex_month
            + delete_sep
            + insert_space
            + month_numbers_graph
            + delete_sep
            + insert_space
            + (year_graph | two_digit_year)
        )

    graph_ymd = pynini.accep("")
    for x in ["-", "/", "."]:
        delete_sep = pynutil.delete(x)
        graph_ymd |= (
            (year_graph | two_digit_year)
            + delete_sep
            + insert_space
            + month_numbers_graph
            + delete_sep
            + insert_space
            + pynini.closure(pynutil.delete("0"), 0, 1)
            + day_graph
        )

    final_graph = graph_mdy | graph_dmy

    if not deterministic or lm:
        final_graph += pynini.closure(pynutil.insert(" preserve_order: true"), 0, 1)
        m_sep_d = (
            month_numbers_graph
            + pynutil.delete(pynini.union("-", "/"))
            + insert_space
            + pynini.closure(pynutil.delete("0"), 0, 1)
            + day_graph
        )
        final_graph |= m_sep_d
    else:
        final_graph += pynutil.insert(" preserve_order: true")

    final_graph |= graph_ymd | year_graph

    if not deterministic or lm:
        # Read both label files once instead of re-reading day.tsv for every month.
        month_labels = [x[0] for x in load_labels(get_abs_path("data/date/month_name.tsv"))]
        day_labels = [x[0] for x in load_labels(get_abs_path("data/date/day.tsv"))]

        ymd_to_mdy_graph = None
        ymd_to_dmy_graph = None
        mdy_to_dmy_graph = None
        md_to_dm_graph = None

        for month in month_labels:
            for day in day_labels:
                # The two field orderings for this (month, day) pair.
                month_day = "month: \"" + month + "\" day: \"" + day + "\""
                day_month = "day: \"" + day + "\" month: \"" + month + "\""

                ymd_to_mdy_curr = (
                    pynutil.insert(month_day + " ")
                    + pynini.accep('year:')
                    + NEMO_SIGMA
                    + pynutil.delete(" " + month_day)
                )

                # YY-MM-DD -> MM-DD-YY
                ymd_to_mdy_curr = pynini.compose(graph_ymd, ymd_to_mdy_curr)
                ymd_to_mdy_graph = (
                    ymd_to_mdy_curr
                    if ymd_to_mdy_graph is None
                    else pynini.union(ymd_to_mdy_curr, ymd_to_mdy_graph)
                )

                ymd_to_dmy_curr = (
                    pynutil.insert(day_month + " ")
                    + pynini.accep('year:')
                    + NEMO_SIGMA
                    + pynutil.delete(" " + month_day)
                )

                # YY-MM-DD -> DD-MM-YY
                ymd_to_dmy_curr = pynini.compose(graph_ymd, ymd_to_dmy_curr).optimize()
                ymd_to_dmy_graph = (
                    ymd_to_dmy_curr
                    if ymd_to_dmy_graph is None
                    else pynini.union(ymd_to_dmy_curr, ymd_to_dmy_graph)
                )

                mdy_to_dmy_curr = (
                    pynutil.insert(day_month + " ")
                    + pynutil.delete(month_day + " ")
                    + pynini.accep('year:')
                    + NEMO_SIGMA
                ).optimize()
                # MM-DD-YY -> verbalize as MM-DD-YY (February fourth 1991) or DD-MM-YY (the fourth of February 1991)
                mdy_to_dmy_curr = pynini.compose(graph_mdy, mdy_to_dmy_curr).optimize()
                mdy_to_dmy_graph = (
                    mdy_to_dmy_curr
                    if mdy_to_dmy_graph is None
                    else pynini.union(mdy_to_dmy_curr, mdy_to_dmy_graph).optimize()
                ).optimize()

                # MM-DD -> DD-MM (no year)
                md_to_dm_curr = pynutil.insert(day_month) + pynutil.delete(month_day)
                md_to_dm_curr = pynini.compose(m_sep_d, md_to_dm_curr).optimize()

                md_to_dm_graph = (
                    md_to_dm_curr
                    if md_to_dm_graph is None
                    else pynini.union(md_to_dm_curr, md_to_dm_graph).optimize()
                ).optimize()

        final_graph |= mdy_to_dmy_graph | md_to_dm_graph | ymd_to_mdy_graph | ymd_to_dmy_graph

    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
def __init__(self, *features: Feature) -> None:
    """Sets up an acceptor for the defined category.

    The acceptor will accept a sequence of valid values for each feature,
    where the ordering is given by the lexicographic order of the name of the
    features --- i.e. the order in which they are given to the constructor is
    irrelevant.

    If one has previously defined:

        case = Feature("case", "nom", "acc", "gen", "dat")
        gen = Feature("gen", "mas", "fem", "neu")
        num = Feature("num", "sg", "pl")

    Then

        noun = Category(case, gen, num)

    will allow any sequence in

        ([case=nom] | [case=acc] | [case=gen] | [case=dat]) +
        ([gen=mas] | [gen=fem] | [gen=neu]) +
        ([num=sg] | [num=pl])

    The feature_filler fills in missing feature values with either the
    default for the given feature if there is one, otherwise all possible
    values. So if we have

        case: nom, gen, acc, n/a
        num: sg, pl

    where "n/a" is the default feature (specified with the default keyword to
    the Feature), then

        [num=sg]

    will be filled to

        [case=n/a][num=sg]

    but

        [case=gen]

    will be filled to

        [case=gen]([num=sg]|[num=pl])

    Args:
        *features: one or more Features.

    Raises:
        Error: if no features are provided.
    """
    if not features:
        # The exception must be raised, not merely constructed; otherwise an
        # empty category would be silently built.
        raise Error("No features provided to Category object")
    self._features = sorted(features, key=operator.attrgetter("name"))
    self._acceptor = _concatstar(f.acceptor for f in self._features)
    self._feature_mapper = self._make_feature_mapper()
    # For each feature: either insert the default (or any value when there is
    # no default) or pass through a value that is already present.
    transducers = []
    for f in self._features:
        default = f.default_acceptor if f.default_acceptor else f.acceptor
        transducers.append(pynutil.insert(default) | f.acceptor)
    self._feature_filler = _concatstar(transducers).optimize()
    self._feature_labels = pynini.project(self._feature_mapper, "input")
    self._sigma_star = pynini.union(byte.BYTE, self._feature_labels).closure().optimize()
def __init__(self):
    """Build the classifier FST that turns spoken cardinals into digit strings.

    Number words come from the ``data/numbers`` TSV files. "hundred" and the
    scale words from "thousand" up to "sextillion" are deleted, absent
    positions are padded with inserted zeros, leading zeros are stripped, and
    an "and" standing between two spaces is removed from the input.

    Attributes set:
        graph_hundred_component_at_least_one_none_zero_digit: three-digit
            component restricted to outputs with a non-zero digit.
        graph_no_exception: the full cardinal graph.
        graph: ``graph_no_exception`` minus the inputs ``num_to_word(0)`` ...
            ``num_to_word(12)``.
        fst: the final tagger, emitting ``integer: "..."`` with an optional
            ``negative: "-"`` prefix for "minus", wrapped by ``add_tokens``.
    """
    super().__init__(name="cardinal", kind="classify")

    zero_fst = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
    digit_fst = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
    ties_fst = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
    teen_fst = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))

    hundred_word = pynini.cross("hundred", "")

    # Hundreds digit (or an inserted "0"), then the last two digits.
    hundreds_digit = pynini.union(
        digit_fst + delete_space + hundred_word,
        pynutil.insert("0"),
    )
    last_two_digits = pynini.union(
        teen_fst | pynutil.insert("00"),
        (ties_fst | pynutil.insert("0")) + delete_space + (digit_fst | pynutil.insert("0")),
    )
    hundred_component = hundreds_digit + delete_space + last_two_digits

    has_nonzero_digit = (
        pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT)
    )
    nonzero_hundred_component = hundred_component @ has_nonzero_digit
    self.graph_hundred_component_at_least_one_none_zero_digit = nonzero_hundred_component

    # Largest magnitude first; each group is "<1-999> <word>" or inserted "000".
    scale_words = (
        "sextillion",
        "quintillion",
        "quadrillion",
        "trillion",
        "billion",
        "million",
        "thousand",
    )
    number = None
    for word in scale_words:
        group = pynini.union(
            nonzero_hundred_component + delete_space + pynutil.delete(word),
            pynutil.insert("000", weight=0.1),
        )
        number = group if number is None else number + delete_space + group
    number = number + delete_space + hundred_component

    graph = pynini.union(number, zero_fst)

    # Drop leading zeros, but keep a lone "0".
    strip_leading_zeros = pynini.union(
        pynutil.delete(pynini.closure("0"))
        + pynini.difference(NEMO_DIGIT, "0")
        + pynini.closure(NEMO_DIGIT),
        "0",
    )
    graph = graph @ strip_leading_zeros

    excluded_words = pynini.union(*[num_to_word(value) for value in range(13)])

    # Remove a space-delimited "and"; the input must start with a letter.
    drop_and = pynini.cdrewrite(pynutil.delete("and"), NEMO_SPACE, NEMO_SPACE, NEMO_SIGMA)
    graph = drop_and @ (NEMO_ALPHA + NEMO_SIGMA) @ graph

    self.graph_no_exception = graph
    allowed_inputs = pynini.project(graph, "input") - excluded_words.arcsort()
    self.graph = allowed_inputs @ graph

    sign = pynini.closure(
        pynutil.insert("negative: ") + pynini.cross("minus", "\"-\"") + NEMO_SPACE,
        0,
        1,
    )
    tagged = sign + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")
    tagged = self.add_tokens(tagged)
    self.fst = tagged.optimize()
def _make_analyzer(self) -> None:
    """Construct ``self._analyzer`` from the stems-to-forms transducer.

    The output side of ``self._stems_to_forms`` is composed with
    ``self._deleter``; the result is then inverted and optimized in place.
    """
    analyzer = pynini.project(self._stems_to_forms, "output") @ self._deleter
    analyzer.invert()
    analyzer.optimize()
    self._analyzer = analyzer