def __init__(self, cardinal: GraphFst, decimal: GraphFst):
    """Build the "measure" classifier and store it in ``self.fst``.

    The grammar accepts an optional "minus", a number (decimal or cardinal)
    and a unit. It emits ``decimal { ... }`` or ``cardinal { ... }`` followed
    by a ``units: "..."`` field, wrapped by ``self.add_tokens``.

    Args:
        cardinal: grammar whose ``graph_no_exception`` reads the integer.
        decimal: grammar whose ``final_graph_wo_negative`` reads the decimal.
    """
    super().__init__(name="measure", kind="classify")
    # fields: decimal, fraction, cardinal, units, style(depr)
    integer_graph = cardinal.graph_no_exception

    unit_table = pynini.string_file(get_abs_path("data/measurements.tsv"))
    singular_to_abbr = pynini.invert(unit_table)  # singular -> abbr
    plural_to_abbr = get_singulars(singular_to_abbr)  # plural -> abbr

    # Optional sign: spoken "minus" is emitted as negative: "true".
    sign = pynutil.insert("negative: ") + pynini.cross("minus", "\"true\"") + delete_extra_space
    optional_sign = pynini.closure(sign, 0, 1)

    singular_unit = convert_space(singular_to_abbr)
    plural_unit = convert_space(plural_to_abbr)
    # "per <unit>" is emitted as "/<unit>".
    per_unit = pynutil.insert("/") + pynutil.delete("per") + delete_space + convert_space(singular_to_abbr)

    def units_field(unit):
        # Accept a plain unit, a lone "per <unit>", or (with a small extra
        # weight) a unit immediately followed by "per <unit>".
        compound = pynutil.add_weight(unit + delete_space + per_unit, 0.01)
        return pynutil.insert("units: \"") + (unit | per_unit | compound) + pynutil.insert("\"")

    units_for_one = units_field(singular_unit)
    units_for_many = units_field(plural_unit)

    def cardinal_measure(integer, units):
        # cardinal { [negative] integer: "<n>" } <units>
        return (
            pynutil.insert("cardinal { ")
            + optional_sign
            + pynutil.insert("integer: \"")
            + integer
            + pynutil.insert("\"")
            + pynutil.insert(" }")
            + delete_extra_space
            + units
        )

    decimal_measure = (
        pynutil.insert("decimal { ")
        + optional_sign
        + decimal.final_graph_wo_negative
        + pynutil.insert(" }")
        + delete_extra_space
        + units_for_many
    )

    # Every cardinal except "one" takes the plural unit; "one" takes the singular.
    many_measure = cardinal_measure((NEMO_SIGMA - "one") @ integer_graph, units_for_many)
    one_measure = cardinal_measure(pynini.cross("one", "1"), units_for_one)
    cardinal_measures = many_measure | one_measure

    tagged = self.add_tokens(decimal_measure | cardinal_measures)
    self.fst = tagged.optimize()
def __init__(self):
    """Build the "time" classifier and store it in ``self.fst``.

    Two layouts are accepted, each optionally followed by a suffix and a
    time zone:

    * hour then minutes (minutes may be empty, an "o'clock" variant,
      "o" + single digit, or a two-digit minute)
    * minutes, the word "past", then the hour
    """
    super().__init__(name="time", kind="classify")
    # fields: hours, minutes, seconds, suffix, zone, style, speak_period
    suffixes = pynini.string_file(get_abs_path("data/time_suffix.tsv"))
    zones = pynini.invert(pynini.string_file(get_abs_path("data/time_zone.tsv")))

    # only used for < 1000 thousand -> 0 weight
    number = pynutil.add_weight(CardinalFst().graph_no_exception, weight=-0.7)

    def spoken_range(start, stop):
        # Union of num_to_word(start..stop-1), composed with the weighted cardinal graph.
        words = [num_to_word(value) for value in range(start, stop)]
        return pynini.union(*words) @ number

    hour = spoken_range(0, 24)
    minute_one_digit = spoken_range(1, 10)
    minute_two_digit = spoken_range(10, 60)
    minute_named = pynini.cross("half", "30") | pynini.cross("quarter", "15")

    # All accepted spellings of "o'clock" are consumed without output.
    oclock_spellings = pynini.union("o' clock", "o clock", "o'clock", "oclock")
    drop_oclock = pynini.cross(oclock_spellings, "")

    hours_field = pynutil.insert("hours: \"") + hour + pynutil.insert("\"")

    # Minute alternatives that may follow an hour.
    no_minutes = pynutil.insert("00")
    oclock_minutes = drop_oclock + pynutil.insert("00")
    o_digit_minutes = pynutil.delete("o") + delete_space + minute_one_digit
    minutes_after_hour = no_minutes | oclock_minutes | o_digit_minutes | minute_two_digit
    minutes_field = pynutil.insert("minutes: \"") + minutes_after_hour + pynutil.insert("\"")

    suffix_field = pynutil.insert("suffix: \"") + convert_space(suffixes) + pynutil.insert("\"")
    optional_suffix = pynini.closure(delete_space + insert_space + suffix_field, 0, 1)

    zone_field = (
        delete_space
        + insert_space
        + pynutil.insert("zone: \"")
        + convert_space(zones)
        + pynutil.insert("\"")
    )
    optional_zone = pynini.closure(zone_field, 0, 1)

    # five o' clock
    # two o eight, two thirty five (am/pm)
    # two pm/am
    hour_then_minute = hours_field + delete_extra_space + minutes_field

    # 10 past four, quarter past four, half past four
    minutes_before_past = pynini.union(minute_one_digit, minute_two_digit, minute_named)
    minute_past_hour = (
        pynutil.insert("minutes: \"")
        + minutes_before_past
        + pynutil.insert("\"")
        + delete_space
        + pynutil.delete("past")
        + delete_extra_space
        + hours_field
    )

    time_graph = (hour_then_minute | minute_past_hour) + optional_suffix + optional_zone
    time_graph = time_graph.optimize()
    self.fst = self.add_tokens(time_graph).optimize()
def __init__(self):
    """Build the "whitelist" classifier and store it in ``self.fst``.

    Loads ``data/whitelist.tsv``, inverts the mapping, and wraps the result
    in a ``name: "..."`` field.
    """
    super().__init__(name="whitelist", kind="classify")

    table_path = get_abs_path("data/whitelist.tsv")
    # The file's columns are swapped so the second column becomes the input side.
    entries = pynini.string_file(table_path).invert()

    open_field = pynutil.insert("name: \"")
    close_field = pynutil.insert("\"")
    tagged = open_field + convert_space(entries) + close_field

    self.fst = tagged.optimize()
def __init__(self):
    """Build the "word" classifier and store it in ``self.fst``.

    Emits ``name: "<text>"`` where ``<text>`` is either a run of one or
    more non-space symbols (each carrying weight 0.1) or an entry from
    ``data/sentence_boundary_exceptions.txt``.
    """
    super().__init__(name="word", kind="classify")

    boundary_exceptions = pynini.string_file(
        get_abs_path("data/sentence_boundary_exceptions.txt")
    )

    # Weight is applied per symbol, so longer plain words cost more.
    weighted_symbol = pynutil.add_weight(NEMO_NOT_SPACE, weight=0.1)
    plain_word = pynini.closure(weighted_symbol, 1)
    payload = plain_word | convert_space(boundary_exceptions)

    tagged = pynutil.insert("name: \"") + payload + pynutil.insert("\"")
    self.fst = tagged.optimize()
def __init__(self):
    """Build the "punctuation" classifier and store it in ``self.fst``.

    Each mark is emitted as a complete ``tokens { ... }`` entry holding a
    ``name`` and a ``pause_length`` field. The marks ``, ; ( )`` get the
    PAUSE_MEDIUM text and ``. ! ? :`` get the PAUSE_LONG text.
    """
    super().__init__(name="punctuation", kind="classify")

    def pause_token(marks, pause_text):
        # tokens { name: "<mark>" pause_length: "<pause_text>" }
        return (
            pynutil.insert("tokens { name: \"")
            + convert_space(marks)
            + pynutil.insert("\"")
            + pynutil.insert(" pause_length: \"")
            + convert_space(pynutil.insert(pause_text))
            + pynutil.insert("\" }")
        )

    medium_marks = pynini.union(",", ";", "(", ")")
    long_marks = pynini.union(".", "!", "?", ":")

    medium_token = pause_token(medium_marks, "PAUSE_MEDIUM phrase_break: true type: PUNCT")
    long_token = pause_token(long_marks, "PAUSE_LONG phrase_break: true type: PUNCT")

    self.fst = (medium_token | long_token).optimize()
def __init__(self, cardinal: GraphFst, decimal: GraphFst):
    """Build the "money" classifier and store it in ``self.fst``.

    Accepted shapes:

    * integer + currency, optionally followed by cents
      ("... (and) <n> cents" or a bare trailing number)
    * decimal + currency
    * cents on their own, emitted with currency "$" and integer part "0"

    Args:
        cardinal: grammar whose ``graph_no_exception`` reads integers.
        decimal: grammar whose ``final_graph_wo_negative`` reads decimals.
    """
    super().__init__(name="money", kind="classify")
    # fields: quantity, integer_part, fractional_part, currency
    integer_graph = cardinal.graph_no_exception
    decimal_graph = decimal.final_graph_wo_negative

    currency_table = pynini.string_file(get_abs_path("data/currency.tsv"))
    singular_currency = pynini.invert(currency_table)
    plural_currency = get_singulars(singular_currency)

    currency_one = pynutil.insert("currency: \"") + convert_space(singular_currency) + pynutil.insert("\"")
    currency_many = pynutil.insert("currency: \"") + convert_space(plural_currency) + pynutil.insert("\"")

    # Two digits pass through unchanged; a single digit gets a leading "0".
    two_digits = NEMO_DIGIT + NEMO_DIGIT
    padded_digit = pynutil.insert("0") + NEMO_DIGIT
    pad_to_two_digits = two_digits | padded_digit

    not_one = (NEMO_SIGMA - "one") @ integer_graph

    # twelve dollars (and) fifty cents, zero cents
    plural_cents = (
        pynutil.add_weight(not_one, -0.7) @ pad_to_two_digits
        + delete_space
        + pynutil.delete("cents")
    )
    single_cent = pynini.cross("one", "01") + delete_space + pynutil.delete("cent")
    cents_only = (
        pynutil.insert("fractional_part: \"")
        + pynini.union(plural_cents, single_cent)
        + pynutil.insert("\"")
    )

    optional_and = pynini.closure(pynutil.delete("and") + delete_space, 0, 1)
    optional_spoken_cents = pynini.closure(
        delete_space + optional_and + insert_space + cents_only,
        0,
        1,
    )

    # twelve dollars fifty, only after integer
    bare_cents = (
        delete_extra_space
        + pynutil.insert("fractional_part: \"")
        + pynutil.add_weight(integer_graph @ pad_to_two_digits, -0.7)
        + pynutil.insert("\"")
    )
    optional_bare_cents = pynini.closure(bare_cents, 0, 1)

    def integer_amount(number, currency):
        # integer_part: "<n>" <currency> [fractional_part: "<cc>"]
        return (
            pynutil.insert("integer_part: \"")
            + number
            + pynutil.insert("\"")
            + delete_extra_space
            + currency
            + (optional_spoken_cents | optional_bare_cents)
        )

    # "one" pairs with the singular currency name; everything else with the plural.
    many_amount = integer_amount((NEMO_SIGMA - "one") @ integer_graph, currency_many)
    one_amount = integer_amount(pynini.cross("one", "1"), currency_one)
    integer_amounts = many_amount | one_amount

    decimal_amount = decimal_graph + delete_extra_space + currency_many
    cents_amount = pynutil.insert("currency: \"$\" integer_part: \"0\" ") + cents_only
    decimal_amounts = decimal_amount | cents_amount

    tagged = self.add_tokens(integer_amounts | decimal_amounts)
    self.fst = tagged.optimize()