def _get_tagged_text(self, text, n_tagged):
    """
    Returns text after tokenize and classify.

    Args:
        text: input text
        n_tagged: number of tagged options to consider, -1 - return all possible tagged options

    Returns:
        list of tagged-text options produced by the tagger FST
    """

    def _apply(fst):
        # Single switch between "all rewrites" and "n-shortest rewrites";
        # keeps the FST-selection logic below from being duplicated per mode.
        if n_tagged == -1:
            return rewrite.rewrites(text, fst)
        return rewrite.top_rewrites(text, fst, nshortest=n_tagged)

    # English input containing "[...]" carries arpabet phonemes: use the full
    # tagger so the phoneme tokens stay in the list of options. Non-English
    # languages only have the full tagger.
    if self.lang != "en" or ("[" in text and "]" in text):
        return _apply(self.tagger.fst)
    try:
        # Prefer the tagger graph that produces output without digits.
        return _apply(self.tagger.fst_no_digits)
    except pynini.lib.rewrite.Error:
        # Digit-free graph failed to accept the input — fall back to the full graph.
        return _apply(self.tagger.fst)
def normalize(
    self, text: str, n_tagged: int, punct_post_process: bool = True, verbose: bool = False,
) -> str:
    """
    Main function. Normalizes tokens from written to spoken form
        e.g. 12 kg -> twelve kilograms

    Args:
        text: string that may include semiotic classes
        n_tagged: number of tagged options to consider, -1 - to get all possible tagged options
        punct_post_process: whether to normalize punctuation
        verbose: whether to print intermediate meta information

    Returns:
        normalized text options (usually there are multiple ways of normalizing a given
        semiotic class). In LM mode a tuple of (normalized texts, lattice weights) is
        returned instead of a set.

    Raises:
        ValueError: if the language is unsupported in LM mode, or no normalization
            option could be produced.
    """
    assert (
        len(text.split()) < 500
    ), "Your input is too long. Please split up the input into sentences, or strings with fewer than 500 words"

    original_text = text
    text = pre_process(text)  # to handle []
    text = text.strip()
    if not text:
        if verbose:
            print(text)
        return text
    text = pynini.escape(text)

    if self.lm:
        if self.lang not in ["en"]:
            raise ValueError(f"{self.lang} is not supported in LM mode")
        if self.lang == "en":
            # Build the full rewrite lattice, preferring the digit-free tagger graph.
            try:
                lattice = rewrite.rewrite_lattice(text, self.tagger.fst_no_digits)
            except pynini.lib.rewrite.Error:
                lattice = rewrite.rewrite_lattice(text, self.tagger.fst)
            lattice = rewrite.lattice_to_nshortest(lattice, n_tagged)
            # Each path item is (input, output, weight); keep (output, weight).
            tagged_texts = [(x[1], float(x[2])) for x in lattice.paths().items()]
            tagged_texts.sort(key=lambda x: x[1])
            tagged_texts, weights = list(zip(*tagged_texts))
    else:

        def _tag(fst):
            # "all rewrites" when n_tagged == -1, otherwise the n shortest.
            if n_tagged == -1:
                return rewrite.rewrites(text, fst)
            return rewrite.top_rewrites(text, fst, nshortest=n_tagged)

        if self.lang == "en":
            try:
                # try self.tagger graph that produces output without digits
                tagged_texts = _tag(self.tagger.fst_no_digits)
            except pynini.lib.rewrite.Error:
                tagged_texts = _tag(self.tagger.fst)
        else:
            tagged_texts = _tag(self.tagger.fst)

    # non-deterministic Eng normalization uses tagger composed with verbalizer,
    # no permutation in between
    if self.lang == "en":
        normalized_texts = tagged_texts
    else:
        normalized_texts = []
        for tagged_text in tagged_texts:
            self._verbalize(tagged_text, normalized_texts, verbose=verbose)

    if len(normalized_texts) == 0:
        raise ValueError(f"Normalization failed: no options produced for input: {original_text}")

    if punct_post_process:
        # do post-processing based on Moses detokenizer
        if self.processor:
            normalized_texts = [self.processor.detokenize([t]) for t in normalized_texts]
            normalized_texts = [
                post_process_punct(input=original_text, normalized_text=t) for t in normalized_texts
            ]

    if self.lm:
        return normalized_texts, weights
    normalized_texts = set(normalized_texts)
    return normalized_texts
def normalize(
    self, text: str, n_tagged: int, punct_post_process: bool = True, verbose: bool = False,
) -> str:
    """
    Main function. Normalizes tokens from written to spoken form
        e.g. 12 kg -> twelve kilograms

    Args:
        text: string that may include semiotic classes
        n_tagged: number of tagged options to consider, -1 - to get all possible tagged options
        punct_post_process: whether to normalize punctuation
        verbose: whether to print intermediate meta information

    Returns:
        normalized text options (usually there are multiple ways of normalizing a given
        semiotic class); returned as a set of strings.

    Raises:
        ValueError: if no normalization option could be produced.
    """
    original_text = text
    if self.lang == "en":
        # pre_process handles "[...]" (e.g. arpabet phoneme spans) for English only.
        text = pre_process(text)
    text = text.strip()
    if not text:
        if verbose:
            print(text)
        return text
    text = pynini.escape(text)

    def _tag(fst):
        # "all rewrites" when n_tagged == -1, otherwise the n shortest.
        if n_tagged == -1:
            return rewrite.rewrites(text, fst)
        return rewrite.top_rewrites(text, fst, nshortest=n_tagged)

    if self.lang == "en":
        try:
            # Prefer the tagger graph that produces output without digits.
            tagged_texts = _tag(self.tagger.fst_no_digits)
        except pynini.lib.rewrite.Error:
            tagged_texts = _tag(self.tagger.fst)
    else:
        tagged_texts = _tag(self.tagger.fst)

    # non-deterministic Eng normalization uses tagger composed with verbalizer,
    # no permutation in between
    if self.lang == "en":
        normalized_texts = tagged_texts
    else:
        normalized_texts = []
        for tagged_text in tagged_texts:
            self._verbalize(tagged_text, normalized_texts, verbose=verbose)

    if len(normalized_texts) == 0:
        raise ValueError(f"Normalization failed: no options produced for input: {original_text}")

    if punct_post_process:
        # do post-processing based on Moses detokenizer
        if self.processor:
            normalized_texts = [self.processor.detokenize([t]) for t in normalized_texts]
            normalized_texts = [
                post_process_punct(input=original_text, normalized_text=t) for t in normalized_texts
            ]
        else:
            print("NEMO_NLP collection is not available: skipping punctuation post_processing")

    normalized_texts = set(normalized_texts)
    return normalized_texts