def inverse_normalize(text: str, verbose: bool) -> str:
    """
    Main function. Normalizes spoken tokens in the given text to their written form,
    e.g. twelve kilograms -> 12 kg

    Args:
        text: string that may include semiotic classes
        verbose: whether to print intermediate meta information

    Returns:
        written form
    """
    text = pynini.escape(text)
    tagged_lattice = find_tags(text)
    tagged_text = select_tag(tagged_lattice)
    parser(tagged_text)
    tokens = parser.parse()
    tags_reordered = generate_permutations(tokens)
    for tagged_text in tags_reordered:
        tagged_text = pynini.escape(tagged_text)
        verbalizer_lattice = find_verbalizer(tagged_text)
        if verbalizer_lattice.num_states() == 0:
            continue
        output = select_verbalizer(verbalizer_lattice)
        if verbose:
            print(output)
        return output
    raise ValueError(f"No verbalization found for: {text}")
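# A self-contained toy sketch of the tag-then-verbalize pipeline that
# inverse_normalize runs. The two cdrewrite rules below are hypothetical
# stand-ins for the real tagger/verbalizer grammars; only the call pattern
# (escape -> tag -> verbalize) mirrors the function above.
import pynini
from pynini.lib import byte, rewrite

_SIGMA_STAR = pynini.closure(byte.BYTE)
_toy_tagger = pynini.cdrewrite(
    pynini.cross("twelve", 'cardinal { integer: "12" }'), "", "", _SIGMA_STAR
).optimize()
_toy_verbalizer = pynini.cdrewrite(
    pynini.cross('cardinal { integer: "12" }', "12"), "", "", _SIGMA_STAR
).optimize()

tagged = rewrite.top_rewrite(pynini.escape("twelve kilograms"), _toy_tagger)
print(rewrite.top_rewrite(pynini.escape(tagged), _toy_verbalizer))  # -> 12 kilograms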
def __init__(self, deterministic: bool = True):
    super().__init__(name="word", kind="classify", deterministic=deterministic)
    symbols_to_exclude = (pynini.union("$", "€", "₩", "£", "¥", "#", "%") | NEMO_DIGIT).optimize()
    graph = pynini.closure(pynini.difference(NEMO_NOT_SPACE, symbols_to_exclude), 1)

    # leave phones of format [HH AH0 L OW1] untouched
    phoneme_unit = pynini.closure(NEMO_ALPHA, 1) + pynini.closure(NEMO_DIGIT)
    phoneme = (
        pynini.accep(pynini.escape("["))
        + pynini.closure(phoneme_unit + pynini.accep(" "))
        + phoneme_unit
        + pynini.accep(pynini.escape("]"))
    )
    if not deterministic:
        phoneme = (
            pynini.accep(pynini.escape("["))
            + pynini.closure(pynini.accep(" "), 0, 1)
            + pynini.closure(phoneme_unit + pynini.accep(" "))
            + phoneme_unit
            + pynini.closure(pynini.accep(" "), 0, 1)
            + pynini.accep(pynini.escape("]"))
        )
    self.graph = plurals._priority_union(convert_space(phoneme), graph, NEMO_SIGMA)
    self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()
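# A quick sanity check that the phoneme acceptor above matches an
# ARPABET-style token such as [HH AH0 L OW1]. NEMO_ALPHA and NEMO_DIGIT are
# stood in with simplified single-character acceptors (an assumption; the
# real constants live in the NeMo graph utilities).
import pynini

NEMO_ALPHA = pynini.union(*(chr(c) for c in range(ord("A"), ord("Z") + 1))).optimize()
NEMO_DIGIT = pynini.union(*"0123456789").optimize()

phoneme_unit = pynini.closure(NEMO_ALPHA, 1) + pynini.closure(NEMO_DIGIT)
phoneme = (
    pynini.accep(pynini.escape("["))
    + pynini.closure(phoneme_unit + pynini.accep(" "))
    + phoneme_unit
    + pynini.accep(pynini.escape("]"))
)
# a non-empty composition means the string is accepted
assert (pynini.accep(pynini.escape("[HH AH0 L OW1]")) @ phoneme).num_states() > 0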
def normalize(self, text: str, verbose: bool) -> str:
    """
    Main function. Normalizes tokens from written to spoken form
    e.g. 12 kg -> twelve kilograms

    Args:
        text: string that may include semiotic classes
        verbose: whether to print intermediate meta information

    Returns:
        spoken form
    """
    text = text.strip()
    if not text:
        if verbose:
            print(text)
        return text
    text = pynini.escape(text)
    tagged_lattice = self.find_tags(text)
    tagged_text = self.select_tag(tagged_lattice)
    if verbose:
        print(tagged_text)
    self.parser(tagged_text)
    tokens = self.parser.parse()
    tags_reordered = self.generate_permutations(tokens)
    for tagged_text in tags_reordered:
        tagged_text = pynini.escape(tagged_text)
        verbalizer_lattice = self.find_verbalizer(tagged_text)
        if verbalizer_lattice.num_states() == 0:
            continue
        output = self.select_verbalizer(verbalizer_lattice)
        return output
    raise ValueError(f"No verbalization found for: {text}")
def normalize(
    self,
    text: str,
    verbose: bool = False,
    punct_pre_process: bool = False,
    punct_post_process: bool = False,
) -> str:
    """
    Main function. Normalizes tokens from written to spoken form
    e.g. 12 kg -> twelve kilograms

    Args:
        text: string that may include semiotic classes
        verbose: whether to print intermediate meta information
        punct_pre_process: whether to perform punctuation pre-processing, for example, [25] -> [ 25 ]
        punct_post_process: whether to normalize punctuation

    Returns:
        spoken form
    """
    original_text = text
    if punct_pre_process:
        text = pre_process(text)
    text = text.strip()
    if not text:
        if verbose:
            print(text)
        return text
    text = pynini.escape(text)
    tagged_lattice = self.find_tags(text)
    tagged_text = self.select_tag(tagged_lattice)
    if verbose:
        print(tagged_text)
    self.parser(tagged_text)
    tokens = self.parser.parse()
    tags_reordered = self.generate_permutations(tokens)
    for tagged_text in tags_reordered:
        tagged_text = pynini.escape(tagged_text)
        verbalizer_lattice = self.find_verbalizer(tagged_text)
        if verbalizer_lattice.num_states() == 0:
            continue
        output = self.select_verbalizer(verbalizer_lattice)
        if punct_post_process:
            # do post-processing based on Moses detokenizer
            if self.processor:
                output = self.processor.moses_detokenizer.detokenize([output], unescape=False)
                output = post_process_punct(input=original_text, normalized_text=output)
            else:
                print("NEMO_NLP collection is not available: skipping punctuation post_processing")
        return output
    raise ValueError(f"No verbalization found for: {text}")
def normalize_with_audio(self, text: str, verbose: bool = False) -> Set[str]:
    """
    Main function. Normalizes tokens from written to spoken form
    e.g. 12 kg -> twelve kilograms

    Args:
        text: string that may include semiotic classes
        verbose: whether to print intermediate meta information

    Returns:
        normalized text options (usually there are multiple ways of normalizing a given semiotic class)
    """
    text = text.strip()
    if not text:
        if verbose:
            print(text)
        return text
    text = pynini.escape(text)

    def get_tagged_texts(text):
        tagged_lattice = self.find_tags(text)
        tagged_texts = self.select_all_semiotic_tags(tagged_lattice)
        return tagged_texts

    tagged_texts = set(get_tagged_texts(text))
    normalized_texts = []
    for tagged_text in tagged_texts:
        self.parser(tagged_text)
        tokens = self.parser.parse()
        tags_reordered = self.generate_permutations(tokens)
        for tagged_text_reordered in tags_reordered:
            tagged_text_reordered = pynini.escape(tagged_text_reordered)
            verbalizer_lattice = self.find_verbalizer(tagged_text_reordered)
            if verbalizer_lattice.num_states() == 0:
                continue
            verbalized = self.get_all_verbalizers(verbalizer_lattice)
            for verbalized_option in verbalized:
                normalized_texts.append(verbalized_option)
    if len(normalized_texts) == 0:
        raise ValueError(f"No verbalization found for: {text}")
    normalized_texts = [post_process(t) for t in normalized_texts]
    normalized_texts = set(normalized_texts)
    return normalized_texts
def _verbalize(self, tagged_text: str, normalized_texts: List[str], verbose: bool = False):
    """
    Verbalizes tagged text

    Args:
        tagged_text: text with tags
        normalized_texts: list of possible normalization options
        verbose: if true, prints intermediate classification results
    """

    def get_verbalized_text(tagged_text):
        return rewrite.rewrites(tagged_text, self.verbalizer.fst)

    self.parser(tagged_text)
    tokens = self.parser.parse()
    tags_reordered = self.generate_permutations(tokens)
    for tagged_text_reordered in tags_reordered:
        try:
            tagged_text_reordered = pynini.escape(tagged_text_reordered)
            normalized_texts.extend(get_verbalized_text(tagged_text_reordered))
            if verbose:
                print(tagged_text_reordered)
        except pynini.lib.rewrite.Error:
            continue
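# The pynini.lib.rewrite helpers raise rewrite.Error when the composed
# lattice is empty, which is why the loop above swallows that exception and
# moves on to the next permutation. A minimal demonstration with a toy rule:
import pynini
from pynini.lib import rewrite

toy_rule = pynini.cross("a", "b").optimize()
print(rewrite.rewrites("a", toy_rule))  # ['b']
try:
    rewrite.rewrites("zzz", toy_rule)  # no path through the rule
except rewrite.Error:
    print("no rewrite for this input")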
def normalize(
    self,
    text: str,
    n_tagged: int,
    punct_pre_process: bool = True,
    punct_post_process: bool = True,
    verbose: bool = False,
) -> Set[str]:
    """
    Main function. Normalizes tokens from written to spoken form
    e.g. 12 kg -> twelve kilograms

    Args:
        text: string that may include semiotic classes
        n_tagged: number of tagged options to consider, -1 - to get all possible tagged options
        punct_pre_process: whether to perform punctuation pre-processing, for example, [25] -> [ 25 ]
        punct_post_process: whether to normalize punctuation
        verbose: whether to print intermediate meta information

    Returns:
        normalized text options (usually there are multiple ways of normalizing a given semiotic class)
    """
    if punct_pre_process:
        text = pre_process(text)
    text = text.strip()
    if not text:
        if verbose:
            print(text)
        return text
    text = pynini.escape(text)

    if n_tagged == -1:
        tagged_texts = rewrite.rewrites(text, self.tagger.fst)
    else:
        tagged_texts = rewrite.top_rewrites(text, self.tagger.fst, nshortest=n_tagged)

    # non-deterministic Eng normalization uses tagger composed with verbalizer, no permutation in between
    if self.lang == 'en':
        normalized_texts = tagged_texts
    else:
        normalized_texts = []
        for tagged_text in tagged_texts:
            self._verbalize(tagged_text, normalized_texts)

    if len(normalized_texts) == 0:
        raise ValueError(f"No verbalization found for: {text}")

    if punct_post_process:
        normalized_texts = [post_process_punctuation(t) for t in normalized_texts]
        # do post-processing based on Moses detokenizer
        if self.processor:
            normalized_texts = [self.processor.detokenize([t]) for t in normalized_texts]
    normalized_texts = set(normalized_texts)
    return normalized_texts
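# A toy illustration of the n_tagged branch above: rewrite.rewrites returns
# every output string the tagger lattice admits, while top_rewrites keeps
# only the n shortest. The ambiguous two-way rule here is hypothetical.
import pynini
from pynini.lib import rewrite

ambiguous = pynini.union(
    pynini.cross("2", "two"),
    pynini.cross("2", "second"),
).optimize()
print(sorted(rewrite.rewrites("2", ambiguous)))           # ['second', 'two']
print(rewrite.top_rewrites("2", ambiguous, nshortest=1))  # a single option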
def testVerifyAsciiDefinition(self):
    ascii_char = pynini.string_map(
        # ASCII comprises all single-byte UTF-8 characters with the most
        # significant bit set to 0, barring NUL, which we ignore.
        pynini.escape(chr(codepoint)) for codepoint in range(1, 128)).optimize()
    self.assertFsasEquivalent(ascii_char, utf8.SINGLE_BYTE)
def testVerifyUtf8CharRegionalIndicatorSymbolDefinition(self):
    regional_indicator = pynini.string_map(
        # Regional indicator symbols have codepoints in the range 0x1F1E6
        # through 0x1F1FF.
        pynini.escape(chr(codepoint))
        for codepoint in range(0x1F1E6, 0x1F1FF + 1)).optimize()
    self.assertFsasEquivalent(
        regional_indicator, utf8.VALID_UTF8_CHAR_REGIONAL_INDICATOR_SYMBOL)
def testVerifyUtf8Rfc3629Definition(self):
    utf8_rfc3629_char = pynini.string_map(
        # UTF-8 encoded strings can store codepoints in U+0000 through
        # U+10FFFF, excluding the surrogate halves in U+D800 through
        # U+DFFF, but we exclude U+0000 as it would be strange to match NUL
        # and that label is reserved for epsilon.
        pynini.escape(chr(codepoint))
        for codepoint in range(1, 0x10FFFF + 1)
        if not 0xD800 <= codepoint <= 0xDFFF).optimize()
    self.assertFsasEquivalent(utf8_rfc3629_char, utf8.VALID_UTF8_CHAR)
def __init__(self, deterministic: bool = True):
    super().__init__(name="word", kind="classify", deterministic=deterministic)
    punct = PunctuationFst().graph
    self.graph = pynini.closure(pynini.difference(NEMO_NOT_SPACE, punct.project("input")), 1)

    if not deterministic:
        self.graph = pynini.closure(
            pynini.difference(
                self.graph,
                pynini.union("$", "€", "₩", "£", "¥", "#", "%") + pynini.closure(NEMO_DIGIT, 1),
            ),
            1,
        )

    # leave phones of format [HH AH0 L OW1] untouched
    phoneme_unit = pynini.closure(NEMO_ALPHA, 1) + pynini.closure(NEMO_DIGIT)
    phoneme = (
        pynini.accep(pynini.escape("["))
        + pynini.closure(phoneme_unit + pynini.accep(" "))
        + phoneme_unit
        + pynini.accep(pynini.escape("]"))
    )
    self.graph = plurals._priority_union(convert_space(phoneme), self.graph, NEMO_SIGMA)
    self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()
def _make_feature_mapper(self) -> pynini.Fst:
    r"""Convenience function generating a map to human-readable strings.

    Returns:
        A transducer that maps from internal symbols like "[case=nom]" to a
        sequence that will be readable as a string ("\[case=nom\]") for all
        feature-value combinations.
    """
    pairs = []
    for feature in self._features:
        name = feature.name
        for value in feature.values:
            f = f"[{name}={value}]"
            v = pynini.escape(f)
            pairs.append(pynini.cross(f, v))
    return pynini.union(*pairs).closure().optimize()
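# Why the escape matters here: in Pynini's string compiler, unescaped
# bracketed text like "[case=nom]" denotes a single generated symbol, while
# the escaped form compiles to the literal characters. A quick check:
import pynini

print(pynini.escape("[case=nom]"))  # \[case=nom\]
# The escaped acceptor spells out the ten literal characters
# (a linear chain of 10 arcs, hence 11 states):
assert pynini.accep(pynini.escape("[case=nom]")).num_states() == 11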
def post_process(self, normalized_text: 'pynini.FstLike') -> str:
    """
    Runs post-processing graph on normalized text

    Args:
        normalized_text: normalized text

    Returns:
        shortest path
    """
    normalized_text = normalized_text.strip()
    if not normalized_text:
        return normalized_text
    normalized_text = pynini.escape(normalized_text)
    if self.post_processor is not None:
        normalized_text = top_rewrite(normalized_text, self.post_processor.fst)
    return normalized_text
def ApplyOnText(self, text: str) -> str:
    """Transduce the given string using the FST.

    Args:
        text: Input string to be transduced.

    Returns:
        Transduced string output.

    Raises:
        ValueError on Pynini string compilation exceptions.

    This operation involves pre-composing the input string with the FST and
    then finding the shortest path to output a resultant string.
    """
    try:
        # Square brackets and backslash carry special meaning in Pynini,
        # so they need to be escaped for unmanaged strings.
        return pynini.shortestpath(pynini.escape(text) @ self._fst).string()
    except pynini.FstOpError as error:
        raise ValueError(f'{error} on the string (between quotes): `{text}`')
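# A self-contained sketch of the escape -> compose -> shortestpath pattern
# used by ApplyOnText, with a toy rewrite rule standing in for self._fst.
import pynini
from pynini.lib import byte

_sigma_star = pynini.closure(byte.BYTE)
_toy_fst = pynini.cdrewrite(pynini.cross("colour", "color"), "", "", _sigma_star).optimize()

# The brackets survive because escaping turns them into literal characters
# before composition.
print(pynini.shortestpath(pynini.escape("my [colour]") @ _toy_fst).string())  # my [color]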
def normalize(
    self,
    text: str,
    n_tagged: int,
    punct_post_process: bool = True,
    verbose: bool = False,
) -> str:
    """
    Main function. Normalizes tokens from written to spoken form
    e.g. 12 kg -> twelve kilograms

    Args:
        text: string that may include semiotic classes
        n_tagged: number of tagged options to consider, -1 - to get all possible tagged options
        punct_post_process: whether to normalize punctuation
        verbose: whether to print intermediate meta information

    Returns:
        normalized text options (usually there are multiple ways of normalizing a given semiotic class)
    """
    assert (
        len(text.split()) < 500
    ), "Your input is too long. Please split up the input into sentences, or strings with fewer than 500 words"

    original_text = text
    text = pre_process(text)  # to handle []
    text = text.strip()
    if not text:
        if verbose:
            print(text)
        return text
    text = pynini.escape(text)

    if self.lm:
        if self.lang not in ["en"]:
            raise ValueError(f"{self.lang} is not supported in LM mode")
        if self.lang == "en":
            try:
                lattice = rewrite.rewrite_lattice(text, self.tagger.fst_no_digits)
            except pynini.lib.rewrite.Error:
                lattice = rewrite.rewrite_lattice(text, self.tagger.fst)
            lattice = rewrite.lattice_to_nshortest(lattice, n_tagged)
            tagged_texts = [(x[1], float(x[2])) for x in lattice.paths().items()]
            tagged_texts.sort(key=lambda x: x[1])
            tagged_texts, weights = list(zip(*tagged_texts))
    else:
        if n_tagged == -1:
            if self.lang == "en":
                try:
                    tagged_texts = rewrite.rewrites(text, self.tagger.fst_no_digits)
                except pynini.lib.rewrite.Error:
                    tagged_texts = rewrite.rewrites(text, self.tagger.fst)
            else:
                tagged_texts = rewrite.rewrites(text, self.tagger.fst)
        else:
            if self.lang == "en":
                try:
                    tagged_texts = rewrite.top_rewrites(text, self.tagger.fst_no_digits, nshortest=n_tagged)
                except pynini.lib.rewrite.Error:
                    tagged_texts = rewrite.top_rewrites(text, self.tagger.fst, nshortest=n_tagged)
            else:
                tagged_texts = rewrite.top_rewrites(text, self.tagger.fst, nshortest=n_tagged)

    # non-deterministic Eng normalization uses tagger composed with verbalizer, no permutation in between
    if self.lang == "en":
        normalized_texts = tagged_texts
    else:
        normalized_texts = []
        for tagged_text in tagged_texts:
            self._verbalize(tagged_text, normalized_texts, verbose=verbose)

    if len(normalized_texts) == 0:
        raise ValueError(f"No verbalization found for: {text}")

    if punct_post_process:
        # do post-processing based on Moses detokenizer
        if self.processor:
            normalized_texts = [self.processor.detokenize([t]) for t in normalized_texts]
            normalized_texts = [
                post_process_punct(input=original_text, normalized_text=t) for t in normalized_texts
            ]

    if self.lm:
        return normalized_texts, weights

    normalized_texts = set(normalized_texts)
    return normalized_texts
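# A toy illustration of the LM branch above: build a rewrite lattice, prune
# it to the n shortest paths, then read (output, weight) pairs off the
# lattice the same way the paths().items() comprehension does. The weighted
# two-way rule is hypothetical.
import pynini
from pynini.lib import pynutil, rewrite

weighted = pynini.union(
    pynutil.add_weight(pynini.cross("2", "two"), 1.0),
    pynutil.add_weight(pynini.cross("2", "second"), 2.0),
).optimize()

lattice = rewrite.rewrite_lattice("2", weighted)
lattice = rewrite.lattice_to_nshortest(lattice, 2)
options = [(x[1], float(x[2])) for x in lattice.paths().items()]
options.sort(key=lambda x: x[1])
print(options)  # [('two', 1.0), ('second', 2.0)]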
def normalize(
    self,
    text: str,
    verbose: bool = False,
    punct_pre_process: bool = False,
    punct_post_process: bool = False,
) -> str:
    """
    Main function. Normalizes tokens from written to spoken form
    e.g. 12 kg -> twelve kilograms

    Args:
        text: string that may include semiotic classes
        verbose: whether to print intermediate meta information
        punct_pre_process: whether to perform punctuation pre-processing, for example, [25] -> [ 25 ]
        punct_post_process: whether to normalize punctuation

    Returns:
        spoken form
    """
    assert (
        len(text.split()) < 500
    ), "Your input is too long. Please split up the input into sentences, or strings with fewer than 500 words"

    original_text = text
    if punct_pre_process:
        text = pre_process(text)
    text = text.strip()
    if not text:
        if verbose:
            print(text)
        return text
    text = pynini.escape(text)
    tagged_lattice = self.find_tags(text)
    tagged_text = self.select_tag(tagged_lattice)
    if verbose:
        print(tagged_text)
    self.parser(tagged_text)
    tokens = self.parser.parse()
    split_tokens = self._split_tokens_to_reduce_number_of_permutations(tokens)
    output = ""
    for s in split_tokens:
        tags_reordered = self.generate_permutations(s)
        verbalizer_lattice = None
        for tagged_text in tags_reordered:
            tagged_text = pynini.escape(tagged_text)
            verbalizer_lattice = self.find_verbalizer(tagged_text)
            if verbalizer_lattice.num_states() != 0:
                break
        if verbalizer_lattice is None or verbalizer_lattice.num_states() == 0:
            raise ValueError(f"No permutations were generated from tokens {s}")
        output += ' ' + self.select_verbalizer(verbalizer_lattice)
    output = SPACE_DUP.sub(' ', output[1:])

    if self.lang == "en" and hasattr(self, 'post_processor'):
        output = self.post_process(output)

    if punct_post_process:
        # do post-processing based on Moses detokenizer
        if self.processor:
            output = self.processor.moses_detokenizer.detokenize([output], unescape=False)
            output = post_process_punct(input=original_text, normalized_text=output)
        else:
            print("NEMO_NLP collection is not available: skipping punctuation post_processing")

    return output
def testPunct(self) -> None:
    for s in string.punctuation:
        self.assertAccepts(pynini.escape(s), byte.PUNCT)
def normalize(
    self,
    text: str,
    n_tagged: int,
    punct_post_process: bool = True,
    verbose: bool = False,
) -> Set[str]:
    """
    Main function. Normalizes tokens from written to spoken form
    e.g. 12 kg -> twelve kilograms

    Args:
        text: string that may include semiotic classes
        n_tagged: number of tagged options to consider, -1 - to get all possible tagged options
        punct_post_process: whether to normalize punctuation
        verbose: whether to print intermediate meta information

    Returns:
        normalized text options (usually there are multiple ways of normalizing a given semiotic class)
    """
    original_text = text
    if self.lang == "en":
        text = pre_process(text)

    text = text.strip()
    if not text:
        if verbose:
            print(text)
        return text
    text = pynini.escape(text)

    if n_tagged == -1:
        if self.lang == "en":
            try:
                tagged_texts = rewrite.rewrites(text, self.tagger.fst_no_digits)
            except pynini.lib.rewrite.Error:
                tagged_texts = rewrite.rewrites(text, self.tagger.fst)
        else:
            tagged_texts = rewrite.rewrites(text, self.tagger.fst)
    else:
        if self.lang == "en":
            try:
                tagged_texts = rewrite.top_rewrites(text, self.tagger.fst_no_digits, nshortest=n_tagged)
            except pynini.lib.rewrite.Error:
                tagged_texts = rewrite.top_rewrites(text, self.tagger.fst, nshortest=n_tagged)
        else:
            tagged_texts = rewrite.top_rewrites(text, self.tagger.fst, nshortest=n_tagged)

    # non-deterministic Eng normalization uses tagger composed with verbalizer, no permutation in between
    if self.lang == "en":
        normalized_texts = tagged_texts
    else:
        normalized_texts = []
        for tagged_text in tagged_texts:
            self._verbalize(tagged_text, normalized_texts, verbose=verbose)

    if len(normalized_texts) == 0:
        raise ValueError(f"No verbalization found for: {text}")

    if punct_post_process:
        # do post-processing based on Moses detokenizer
        if self.processor:
            normalized_texts = [self.processor.detokenize([t]) for t in normalized_texts]
            normalized_texts = [
                post_process_punct(input=original_text, normalized_text=t) for t in normalized_texts
            ]
        else:
            print("NEMO_NLP collection is not available: skipping punctuation post_processing")
    normalized_texts = set(normalized_texts)
    return normalized_texts
def get_verbalized_text(tagged_text):
    tagged_text = pynini.escape(tagged_text)
    return rewrite.rewrites(tagged_text, self.verbalizer.fst)
def testAsciiBytes(self) -> None:
    for char in range(1, 128):
        self.assertAccepts(pynini.escape(chr(char)), byte.BYTE)