class TestDate:
    """Date test cases for inverse text normalization and text normalization."""

    # The grammars are only built when pynini is importable; otherwise the
    # attributes stay None and every test below is skipped.
    inverse_normalizer = None
    if PYNINI_AVAILABLE:
        inverse_normalizer = InverseNormalizer()

    @parameterized.expand(parse_test_case_file('data_inverse_text_normalization/test_cases_date.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse normalization of `test_input` must equal `expected`."""
        denormalized = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
        assert denormalized == expected

    normalizer = None
    normalizer_with_audio = None
    if PYNINI_AVAILABLE:
        normalizer = Normalizer(input_case='cased')
        normalizer_with_audio = NormalizerWithAudio(input_case='cased')

    @parameterized.expand(parse_test_case_file('data_text_normalization/test_cases_date.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm_uncased(self, test_input, expected):
        """Check the deterministic output and that `expected` is among the audio-based candidates."""
        normalized = self.normalizer.normalize(test_input, verbose=False)
        assert normalized == expected
        candidates = self.normalizer_with_audio.normalize(test_input, n_tagged=100)
        assert expected in candidates

    normalizer_uppercased = None
    if PYNINI_AVAILABLE:
        normalizer_uppercased = Normalizer(input_case='cased')

    # Inline cases covering abbreviated month names in different casings.
    cases_uppercased = {
        "Aug. 8": "august eighth",
        "8 Aug.": "the eighth of august",
        "aug. 8": "august eighth",
    }

    @parameterized.expand(cases_uppercased.items())
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm_cased(self, test_input, expected):
        """Same checks as `test_norm_uncased`, run on the inline `cases_uppercased` table."""
        normalized = self.normalizer_uppercased.normalize(test_input, verbose=False)
        assert normalized == expected
        candidates = self.normalizer_with_audio.normalize(test_input, n_tagged=100)
        assert expected in candidates
def __process_data(dataset_path, stat_path, min_duration, max_duration, val_size, test_size, seed_for_ds_split):
    """Create train/val/test manifests from a "|"-separated statistics file.

    The first line of `stat_path` is treated as a header and skipped. Every
    other non-empty line is split on "|": the first field is the file stem,
    the second the duration and the last the transcript. Rows whose duration
    lies in [min_duration, max_duration] are normalized, shuffled with a
    seeded RNG and written as JSON lines to `train_manifest.json`,
    `val_manifest.json` and `test_manifest.json` inside `dataset_path`.

    Args:
        dataset_path: dataset root; must support the `/` operator
            (e.g. `pathlib.Path`).
        stat_path: path to the statistics file.
        min_duration: lower duration bound (inclusive).
        max_duration: upper duration bound (inclusive).
        val_size: number of entries in the validation manifest.
        test_size: number of entries in the test manifest.
        seed_for_ds_split: seed for the shuffle that precedes the split.

    Raises:
        AssertionError: if no entries are left for the train split.
    """
    # Create normalizer
    text_normalizer = Normalizer(
        lang="de", input_case="cased", overwrite_cache=True, cache_dir=str(dataset_path / "cache_dir"),
    )
    text_normalizer_call_kwargs = {"punct_pre_process": True, "punct_post_process": True}

    def normalizer_call(x):
        return text_normalizer.normalize(x, **text_normalizer_call_kwargs)

    entries = []
    # The transcripts are German text; decode explicitly instead of relying on
    # the platform default encoding (assumes the file is UTF-8 - TODO confirm).
    with open(stat_path, encoding="utf-8") as f:
        # Let's skip the header
        f.readline()
        for line in tqdm(f):
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line would break the unpacking below.
                continue
            file_stem, duration, *_, text = line.split("|")
            duration = float(duration)

            # file_stem -> dir_name (e.g. maerchen_01_f000051 -> maerchen,
            # ber_psychoanalyse_01_f000046 -> ber_psychoanalyse)
            dir_name = "_".join(file_stem.split("_")[:-2])
            audio_path = dataset_path / dir_name / "wavs" / f"{file_stem}.wav"

            if min_duration <= duration <= max_duration:
                normalized_text = normalizer_call(text)
                entries.append(
                    {
                        'audio_filepath': str(audio_path),
                        'duration': duration,
                        'text': text,
                        'normalized_text': normalized_text,
                    }
                )

    random.Random(seed_for_ds_split).shuffle(entries)
    train_size = len(entries) - val_size - test_size

    assert train_size > 0, "Not enough data for train, val and test"

    def save(p, data):
        # One JSON object per line.
        with open(p, 'w', encoding="utf-8") as f:
            for d in data:
                f.write(json.dumps(d) + '\n')

    save(dataset_path / "train_manifest.json", entries[:train_size])
    save(dataset_path / "val_manifest.json", entries[train_size:train_size + val_size])
    save(dataset_path / "test_manifest.json", entries[train_size + val_size:])
def __process_data(data_root, whitelist_path):
    """Write `{split}_manifest.json` files for the train/val/test file lists.

    Downloads the LJSpeech whitelist when `whitelist_path` is None, and any
    file list missing from `data_root`. For every line of a file list the
    transcript is normalized and one JSON line is written with the audio
    path, its duration, the raw text and the normalized text.

    Args:
        data_root: dataset root; must support `/` and `.exists()`
            (e.g. `pathlib.Path`).
        whitelist_path: whitelist file for the normalizer, or None to
            download the default one into `data_root`.

    Raises:
        AssertionError: if a wav file referenced by a file list is missing.
    """
    if whitelist_path is None:
        wget.download(
            "https://raw.githubusercontent.com/NVIDIA/NeMo/main/nemo_text_processing/text_normalization/en/data/whitelist_lj_speech.tsv",
            out=str(data_root),
        )
        whitelist_path = data_root / "whitelist_lj_speech.tsv"

    text_normalizer = Normalizer(
        lang="en",
        input_case="cased",
        whitelist=whitelist_path,
        overwrite_cache=True,
        cache_dir=data_root / "cache_dir",
    )
    normalize_options = {"punct_pre_process": True, "punct_post_process": True}

    def normalizer_call(raw_text):
        return text_normalizer.normalize(raw_text, **normalize_options)

    # Create manifests (based on predefined NVIDIA's split)
    for split in tqdm(['train', 'val', 'test']):
        # Download file list if necessary
        filelist_path = data_root / f"ljs_audio_text_{split}_filelist.txt"
        if not filelist_path.exists():
            wget.download(f"{FILELIST_BASE}/ljs_audio_text_{split}_filelist.txt", out=str(data_root))

        manifest_target = data_root / f"{split}_manifest.json"
        with open(manifest_target, 'w') as f_out, open(filelist_path, 'r') as filelist:
            print(f"\nCreating {manifest_target}...")
            for row in tqdm(filelist):
                # Fixed-width slices: characters 6-15 hold the utterance id,
                # the transcript starts at column 21.
                basename = row[6:16]
                text = row[21:].strip()
                norm_text = normalizer_call(text)

                # Make sure corresponding wavfile exists
                wav_path = data_root / 'wavs' / f"{basename}.wav"
                assert wav_path.exists(), f"{wav_path} does not exist!"

                record = {
                    'audio_filepath': str(wav_path),
                    'duration': sox.file_info.duration(wav_path),
                    'text': text,
                    'normalized_text': norm_text,
                }
                f_out.write(json.dumps(record) + '\n')
class TestSpecialText:
    """English text-normalization tests driven by `test_cases_special_text.txt`."""

    # Normalizers are only constructed when their prerequisites are met;
    # otherwise the attributes stay None.
    normalizer_en = None
    if PYNINI_AVAILABLE:
        normalizer_en = Normalizer(input_case='cased', lang='en', cache_dir=CACHE_DIR, overwrite_cache=False)

    normalizer_with_audio_en = None
    if PYNINI_AVAILABLE and RUN_AUDIO_BASED_TESTS:
        normalizer_with_audio_en = NormalizerWithAudio(
            input_case='cased', lang='en', cache_dir=CACHE_DIR, overwrite_cache=False
        )

    @parameterized.expand(parse_test_case_file('en/data_text_normalization/test_cases_special_text.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm(self, test_input, expected):
        """Deterministic normalization of `test_input` must equal `expected`."""
        normalized = self.normalizer_en.normalize(test_input, verbose=False)
        assert normalized == expected
class TestRoman:
    """English roman-numeral normalization cases (assertions currently disabled)."""

    normalizer_en = None
    if PYNINI_AVAILABLE:
        normalizer_en = Normalizer(input_case='cased', lang='en', cache_dir=CACHE_DIR, overwrite_cache=False)

    normalizer_with_audio_en = None
    if PYNINI_AVAILABLE and RUN_AUDIO_BASED_TESTS:
        normalizer_with_audio_en = NormalizerWithAudio(
            input_case='cased', lang='en', cache_dir=CACHE_DIR, overwrite_cache=False
        )

    @parameterized.expand(parse_test_case_file('en/data_text_normalization/test_cases_roman.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm(self, test_input, expected):
        # NOTE: the assertions are disabled, so every generated case passes
        # without checking anything. The disabled body is kept for reference:
        #
        # pred = self.normalizer_en.normalize(test_input, verbose=False)
        # assert pred == expected
        #
        # if self.normalizer_with_audio_en:
        #     pred_non_deterministic = self.normalizer_with_audio_en.normalize(
        #         test_input, n_tagged=30, punct_post_process=False,
        #     )
        #     assert expected in pred_non_deterministic
        pass
class TestBoundary:
    """English text-normalization tests driven by `test_cases_boundary.txt`."""

    normalizer_en = None
    if PYNINI_AVAILABLE:
        normalizer_en = Normalizer(input_case='cased', lang='en', cache_dir=CACHE_DIR, overwrite_cache=False)

    # The audio-based normalizer is only built when a cache directory is set.
    normalizer_with_audio_en = None
    if PYNINI_AVAILABLE and CACHE_DIR:
        normalizer_with_audio_en = NormalizerWithAudio(
            input_case='cased', lang='en', cache_dir=CACHE_DIR, overwrite_cache=False
        )

    @parameterized.expand(parse_test_case_file('en/data_text_normalization/test_cases_boundary.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm(self, test_input, expected):
        """Check the deterministic output and, if available, the audio-based candidates."""
        normalized = self.normalizer_en.normalize(test_input, verbose=False)
        assert normalized == expected

        if not self.normalizer_with_audio_en:
            return
        candidates = self.normalizer_with_audio_en.normalize(test_input, n_tagged=30, punct_post_process=False)
        assert expected in candidates
class TestWord:
    """Word test cases for inverse text normalization and text normalization."""

    # Class bodies run at import time, before any `skipif` marker is applied.
    # Build the grammars only when pynini is importable so that a missing
    # dependency results in skipped tests rather than a collection error.
    inverse_normalizer = InverseNormalizer() if PYNINI_AVAILABLE else None

    @parameterized.expand(parse_test_case_file('data_inverse_text_normalization/test_cases_word.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse normalization of `test_input` must equal `expected`."""
        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
        assert pred == expected

    normalizer = Normalizer(input_case='lower_cased') if PYNINI_AVAILABLE else None

    @parameterized.expand(parse_test_case_file('data_text_normalization/test_cases_word.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm(self, test_input, expected):
        """Normalization of `test_input` must equal `expected`."""
        pred = self.normalizer.normalize(test_input, verbose=False)
        assert pred == expected
class TestCardinal:
    """English cardinal test cases for inverse normalization and normalization."""

    inverse_normalizer_en = None
    if PYNINI_AVAILABLE:
        inverse_normalizer_en = InverseNormalizer(lang='en')

    @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_cardinal.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse normalization of `test_input` must equal `expected`."""
        denormalized = self.inverse_normalizer_en.inverse_normalize(test_input, verbose=False)
        assert denormalized == expected

    normalizer_en = None
    normalizer_with_audio_en = None
    if PYNINI_AVAILABLE:
        normalizer_en = Normalizer(input_case='cased', lang='en')
        normalizer_with_audio_en = NormalizerWithAudio(input_case='cased', lang='en')

    @parameterized.expand(parse_test_case_file('en/data_text_normalization/test_cases_cardinal.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm(self, test_input, expected):
        """Check the deterministic output and that `expected` is among the audio-based candidates."""
        normalized = self.normalizer_en.normalize(test_input, verbose=False)
        assert normalized == expected
        candidates = self.normalizer_with_audio_en.normalize(test_input, n_tagged=100)
        assert expected in candidates
class TestBoundary:
    """Text-normalization tests driven by `test_cases_boundary.txt`."""

    normalizer = None
    normalizer_with_audio = None
    if PYNINI_AVAILABLE:
        normalizer = Normalizer(input_case='cased')
        normalizer_with_audio = NormalizerWithAudio(input_case='cased')

    @parameterized.expand(parse_test_case_file('data_text_normalization/test_cases_boundary.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm(self, test_input, expected):
        """Check the deterministic output and that `expected` is among the audio-based candidates."""
        normalized = self.normalizer.normalize(test_input, verbose=False)
        assert normalized == expected

        # Punctuation pre/post-processing is switched off for the audio-based run.
        candidates = self.normalizer_with_audio.normalize(
            test_input, n_tagged=100, punct_pre_process=False, punct_post_process=False
        )
        assert expected in candidates
class TestWhitelist:
    """English whitelist test cases for inverse normalization and normalization."""

    inverse_normalizer_en = None
    if PYNINI_AVAILABLE:
        inverse_normalizer_en = InverseNormalizer(lang='en')

    @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_whitelist.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse normalization of `test_input` must equal `expected`."""
        denormalized = self.inverse_normalizer_en.inverse_normalize(test_input, verbose=False)
        assert denormalized == expected

    normalizer_en = None
    normalizer_with_audio_en = None
    if PYNINI_AVAILABLE:
        normalizer_en = Normalizer(input_case='lower_cased')
        normalizer_with_audio_en = NormalizerWithAudio(input_case='cased', lang='en')

    @parameterized.expand(parse_test_case_file('en/data_text_normalization/test_cases_whitelist.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm(self, test_input, expected):
        """Check the lower-cased normalizer and that `expected` is among the audio-based candidates."""
        normalized = self.normalizer_en.normalize(test_input, verbose=False)
        assert normalized == expected
        candidates = self.normalizer_with_audio_en.normalize(test_input, n_tagged=100)
        assert expected in candidates

    normalizer_uppercased = None
    if PYNINI_AVAILABLE:
        normalizer_uppercased = Normalizer(input_case='cased', lang='en')

    # Inline cases for the cased normalizer: capitalized and lower-case
    # abbreviations have different expected outputs.
    cases_uppercased = {
        "Dr. Evil": "doctor Evil",
        "No. 4": "number four",
        "dr. Evil": "dr. Evil",
        "no. 4": "no. four",
    }

    @parameterized.expand(cases_uppercased.items())
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm_cased(self, test_input, expected):
        """Same checks as `test_norm`, run with the cased normalizer on `cases_uppercased`."""
        normalized = self.normalizer_uppercased.normalize(test_input, verbose=False)
        assert normalized == expected
        candidates = self.normalizer_with_audio_en.normalize(test_input, n_tagged=100)
        assert expected in candidates
class TestWord:
    """English word test cases for inverse normalization and normalization."""

    inverse_normalizer_en = None
    if PYNINI_AVAILABLE:
        inverse_normalizer_en = InverseNormalizer(lang='en', cache_dir=CACHE_DIR, overwrite_cache=False)

    @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_word.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse normalization of `test_input` must equal `expected`."""
        denormalized = self.inverse_normalizer_en.inverse_normalize(test_input, verbose=False)
        assert denormalized == expected

    normalizer_en = None
    if PYNINI_AVAILABLE:
        normalizer_en = Normalizer(
            input_case='cased', lang='en', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True
        )

    normalizer_with_audio_en = None
    if PYNINI_AVAILABLE and RUN_AUDIO_BASED_TESTS:
        normalizer_with_audio_en = NormalizerWithAudio(
            input_case='cased', lang='en', cache_dir=CACHE_DIR, overwrite_cache=False
        )

    @parameterized.expand(parse_test_case_file('en/data_text_normalization/test_cases_word.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm(self, test_input, expected):
        """Check the deterministic output and, if available, the audio-based candidates."""
        normalized = self.normalizer_en.normalize(test_input, verbose=False)
        assert normalized == expected, f"input: {test_input} != {expected}"

        if not self.normalizer_with_audio_en:
            return
        candidates = self.normalizer_with_audio_en.normalize(test_input, n_tagged=3, punct_post_process=False)
        assert expected in candidates, f"input: {test_input}"
class TestWhitelist:
    """Whitelist test cases for inverse text normalization and text normalization."""

    # Class bodies run at import time, before any `skipif` marker is applied.
    # Build the grammars only when pynini is importable so that a missing
    # dependency results in skipped tests rather than a collection error.
    inverse_normalizer = InverseNormalizer() if PYNINI_AVAILABLE else None

    @parameterized.expand(parse_test_case_file('data_inverse_text_normalization/test_cases_whitelist.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse normalization of `test_input` must equal `expected`."""
        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
        assert pred == expected

    normalizer = Normalizer(input_case='lower_cased') if PYNINI_AVAILABLE else None

    @parameterized.expand(parse_test_case_file('data_text_normalization/test_cases_whitelist.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm(self, test_input, expected):
        """Normalization with the lower-cased normalizer must equal `expected`."""
        pred = self.normalizer.normalize(test_input, verbose=False)
        assert pred == expected

    normalizer_uppercased = Normalizer(input_case='cased') if PYNINI_AVAILABLE else None

    # Inline cases for the cased normalizer: capitalized and lower-case
    # abbreviations have different expected outputs.
    cases_uppercased = {
        "Dr. Evil": "doctor Evil",
        "No. 4": "number four",
        "dr. Evil": "dr. Evil",
        "no. 4": "no. four",
    }

    @parameterized.expand(cases_uppercased.items())
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm_cased(self, test_input, expected):
        """Normalization with the cased normalizer must equal `expected`."""
        pred = self.normalizer_uppercased.normalize(test_input, verbose=False)
        assert pred == expected
class TestPunctuation:
    """English punctuation tests, with and without punctuation post-processing."""

    normalizer_en = None
    if PYNINI_AVAILABLE:
        normalizer_en = Normalizer(
            input_case='cased', lang='en', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True,
        )

    @parameterized.expand(parse_test_case_file('en/data_text_normalization/test_cases_punctuation.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm(self, test_input, expected):
        """Normalize with `punct_post_process=False` and compare with `expected`."""
        normalized = self.normalizer_en.normalize(test_input, verbose=True, punct_post_process=False)
        assert normalized == expected, f"input: {test_input} != {expected}"

    @parameterized.expand(parse_test_case_file('en/data_text_normalization/test_cases_punctuation_match_input.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm_python_punct_post_process(self, test_input, expected):
        """Normalize with `punct_post_process=True` and compare with `expected`."""
        normalized = self.normalizer_en.normalize(test_input, verbose=True, punct_post_process=True)
        assert normalized == expected, f"input: {test_input} != {expected}"
type=str, default=None, choices=known_types, ) parser.add_argument("--filter", action='store_true', help="clean data for normalization purposes") return parser.parse_args() if __name__ == "__main__": # Example usage: # python run_evaluate.py --input=<INPUT> --cat=<CATEGORY> --filter args = parse_args() if args.language == 'en': from nemo_text_processing.text_normalization.en.clean_eval_data import filter_loaded_data file_path = args.input normalizer = Normalizer(input_case=args.input_case, lang=args.language) print("Loading training data: " + file_path) training_data = load_files([file_path]) if args.filter: training_data = filter_loaded_data(training_data) if args.category is None: print("Sentence level evaluation...") sentences_un_normalized, sentences_normalized, _ = training_data_to_sentences(training_data) print("- Data: " + str(len(sentences_normalized)) + " sentences") sentences_prediction = normalizer.normalize_list(sentences_un_normalized) print("- Normalized. Evaluating...") sentences_accuracy = evaluate( preds=sentences_prediction, labels=sentences_normalized, input=sentences_un_normalized
def split_text(
    in_file: str,
    out_file: str,
    vocabulary: List[str] = None,
    language='eng',
    remove_brackets=True,
    do_lower_case=True,
    min_length=0,
    max_length=100,
    additional_split_symbols=None,
    use_nemo_normalization=False,
):
    """
    Breaks down the in_file roughly into sentences. Each sentence will be on a separate line.
    Written form of the numbers will be converted to its spoken equivalent, OOV punctuation will be removed.

    Three files are written: `<out_file minus its last 4 chars>_with_punct.txt` (split text with the original
    punctuation and case), `<...>_with_punct_normalized.txt` (numbers spelled out, only vocabulary symbols and
    a fixed set of punctuation marks kept) and `out_file` itself (only vocabulary symbols kept).

    Args:
        in_file: path to original transcript
        out_file: path to the output file; the companion file names are built by dropping its last four
            characters, i.e. a three-letter extension such as ".txt" is assumed
        vocabulary: ASR model vocabulary (required despite the None default)
        language: text language
        remove_brackets: Set to True if square [] and curly {} brackets should be removed from text.
            Text in square/curly brackets often contains inaudible fragments like notes or translations
        do_lower_case: flag that determines whether to apply lower case to the in_file text
        min_length: Min number of chars of the text segment for alignment. Short segments will be combined to be
            at least min_length (not recommended for multi speaker data).
        max_length: Max number of chars of the text segment for alignment
        additional_split_symbols: "|"-separated symbols to use for sentence split if eos sentence split resulted
            in segments longer than --max_length. None or an empty string disables the additional split.
        use_nemo_normalization: Set to True to use NeMo normalization tool to convert numbers from written to spoken
            format. Normalization using num2words will be applied afterwards to make sure there are no numbers present
            in the text, otherwise they will be replaced with a space and that could deteriorate segmentation results.

    Raises:
        TypeError: if `vocabulary` is None.
        ValueError: if NeMo normalization is requested but unavailable, or if it returns a different number of
            sentences.
        NotImplementedError: re-raised from `num2words` after printing a hint about the language.
    """
    if vocabulary is None:
        # Same exception type the later list concatenation would raise, but with a clear message.
        raise TypeError('vocabulary must be provided')
    vocabulary = list(vocabulary)

    print(f'Splitting text in {in_file} into sentences.')
    with open(in_file, "r") as f:
        transcript = f.read()

    # remove some symbols for better split into sentences
    transcript = (
        transcript.replace("\n", " ")
        .replace("\t", " ")
        .replace("…", "...")
        .replace("\\", " ")
        .replace("--", " -- ")
        .replace(". . .", "...")
        .replace("‘", "’")
    )
    # remove extra space
    transcript = re.sub(r' +', ' ', transcript)
    transcript = re.sub(r'(\.+)', '. ', transcript)

    if remove_brackets:
        transcript = re.sub(r'(\[.*?\])', ' ', transcript)
        # remove text in curly brackets
        transcript = re.sub(r'(\{.*?\})', ' ', transcript)

    lower_case_unicode = ''
    upper_case_unicode = ''
    if language == 'ru':
        lower_case_unicode = '\u0430-\u04FF'
        upper_case_unicode = '\u0410-\u042F'
    elif language not in ['ru', 'eng']:
        print(f'Consider using {language} unicode letters for better sentence split.')

    # remove space in the middle of the lower case abbreviation to avoid splitting into separate sentences
    matches = re.findall(r'[a-z' + lower_case_unicode + r']\.\s[a-z' + lower_case_unicode + r']\.', transcript)
    for match in matches:
        transcript = transcript.replace(match, match.replace('. ', '.'))

    # find phrases in quotes
    with_quotes = re.finditer(r'“[A-Za-z ?]+.*?”', transcript)
    sentences = []
    last_idx = 0
    for m in with_quotes:
        match = m.group()
        match_idx = m.start()
        if last_idx < match_idx:
            sentences.append(transcript[last_idx:match_idx])
        sentences.append(match)
        last_idx = m.end()
    sentences.append(transcript[last_idx:])
    sentences = [s.strip() for s in sentences if s.strip()]

    # Read and split transcript by utterance (roughly, sentences)
    split_pattern = rf"(?<!\w\.\w.)(?<![A-Z{upper_case_unicode}][a-z{lower_case_unicode}]\.)(?<![A-Z{upper_case_unicode}]\.)(?<=\.|\?|\!|\.”|\?”\!”)\s"

    new_sentences = []
    for sent in sentences:
        new_sentences.extend(regex.split(split_pattern, sent))
    sentences = [s.strip() for s in new_sentences if s.strip()]

    def additional_split(sentences, split_on_symbols, max_length):
        """Split segments longer than max_length on each of the "|"-separated symbols."""
        # None (the default of `additional_split_symbols`) and '' both mean "no additional split".
        if not split_on_symbols:
            return sentences

        split_on_symbols = split_on_symbols.split('|')

        def _split(sentences, delimiter, max_length):
            result = []
            for s in sentences:
                if len(s) <= max_length:
                    result.append(s)
                else:
                    split_sent = s.split(delimiter)
                    # keep the delimiter attached to the segment it terminated
                    result.extend([part + delimiter for part in split_sent[:-1]] + [split_sent[-1]])
            return result

        another_sent_split = []
        for sent in sentences:
            split_sent = [sent]
            for delimiter in split_on_symbols:
                split_sent = _split(split_sent, delimiter + ' ', max_length)
            another_sent_split.extend(split_sent)

        sentences = [s.strip() for s in another_sent_split if s.strip()]
        return sentences

    sentences = additional_split(sentences, additional_split_symbols, max_length)

    # check to make sure there will be no utterances for segmentation with only OOV symbols
    # Digits are added as characters: they are spelled out later, so a segment that
    # contains only digits (plus OOV symbols) must not be dropped here.
    vocab_no_space_with_digits = set(vocabulary + [str(i) for i in range(10)])
    # discard() rather than remove(): the vocabulary is not guaranteed to contain a space
    vocab_no_space_with_digits.discard(' ')
    sentences = [s for s in sentences if len(vocab_no_space_with_digits.intersection(set(s))) > 0]

    if min_length > 0 and sentences:
        sentences_comb = [sentences[0]]
        # combines short sentence
        for sent in sentences[1:]:
            if len(sentences_comb[-1]) < min_length or len(sent) < min_length:
                sentences_comb[-1] += ' ' + sent.strip()
            else:
                sentences_comb.append(sent.strip())
        sentences = sentences_comb

    sentences = [s.strip() for s in sentences if s.strip()]

    # save split text with original punctuation and case
    out_dir, out_file_name = os.path.split(out_file)
    with open(os.path.join(out_dir, out_file_name[:-4] + '_with_punct.txt'), "w") as f:
        f.write("\n".join(sentences))

    if language == 'ru':
        # substitute common abbreviations before applying lower case
        for k, v in RU_ABBREVIATIONS.items():
            sentences = [s.replace(k, v) for s in sentences]
        # replace Latin characters with Russian
        for k, v in LATIN_TO_RU.items():
            sentences = [s.replace(k, v) for s in sentences]

    if language == 'eng' and use_nemo_normalization:
        if not NEMO_NORMALIZATION_AVAILABLE:
            raise ValueError('NeMo normalization tool is not installed.')

        print('Using NeMo normalization tool...')
        normalizer = Normalizer(input_case='cased')
        sentences_norm = normalizer.normalize_list(sentences, verbose=False)
        if len(sentences_norm) != len(sentences):
            raise ValueError('Normalization failed, number of sentences does not match.')
        # Actually use the normalized text; previously the result was computed and then dropped.
        sentences = sentences_norm

    sentences = '\n'.join(sentences)

    # replace numbers with num2words
    try:
        sentences = re.sub(r"\d+", lambda m: num2words(m.group(), lang=language), sentences)
    except NotImplementedError:
        print(
            f'{language} might be missing in "num2words" package. Add required language to the choices for the '
            f'--language argument.'
        )
        raise

    # The order of the replacements matters (e.g. "--" is first padded and later collapsed to "-").
    sentences = (
        sentences.replace("’", "'")
        .replace("»", '"')
        .replace("«", '"')
        .replace("\\", "")
        .replace("”", '"')
        .replace("„", '"')
        .replace("´", "'")
        .replace("-- --", "--")
        .replace("--", " -- ")
        .replace("’", "'")
        .replace('“', '"')
        .replace('“', '"')
        .replace("‘", "'")
        .replace('—', '-')
        .replace("- -", "--")
        .replace('`', "'")
        .replace(' !', '!')
        .replace(' ?', '?')
        .replace(' ,', ',')
        .replace(' .', '.')
        .replace(' ;', ';')
        .replace(' :', ':')
        .replace('!!', '!')
        .replace('--', '-')
        .replace('“', '"')
        .replace(', , ', ', ')
        .replace('=', '')
    )

    allowed_punct = [',', '.', '?', '!', ':', ';', '-', '"', '(', ')']
    # clean up normalized text and keep only allowed_punct and ASR vocabulary (lower and upper case)
    symbols_to_remove = ''.join(
        set(sentences).difference(set(vocabulary + [s.upper() for s in vocabulary] + ['\n'] + allowed_punct))
    )
    sentences_norm = sentences.translate(''.maketrans(symbols_to_remove, len(symbols_to_remove) * ' '))

    with open(os.path.join(out_dir, out_file_name[:-4] + '_with_punct_normalized.txt'), "w") as f:
        f.write(sentences_norm)

    if do_lower_case:
        sentences = sentences.lower()

    # remove all OOV symbols
    symbols_to_remove = ''.join(set(sentences).difference(set(vocabulary + ['\n'])))
    sentences = sentences.translate(''.maketrans(symbols_to_remove, len(symbols_to_remove) * ' '))

    # remove extra space
    sentences = re.sub(r' +', ' ', sentences)
    with open(out_file, "w") as f:
        f.write(sentences)
def main():
    """Create LJSpeech manifests (and optionally per-utterance transcripts).

    Reads the command-line arguments, optionally downloads the dataset,
    builds the text normalizer selected by `--normalizer_class`, and writes
    one `ljspeech_{split}.json` manifest per file list (train/val/test).

    Raises:
        ValueError: if `normalizer_class` is neither "ENCharParser" nor
            "Normalizer".
        AssertionError: if a wav file referenced by a file list is missing.
    """
    args = get_args()
    ljspeech_dir = args.ljspeech_dir

    # Download LJSpeech dataset if needed
    if args.download_ljspeech:
        get_lj_speech(args.ljspeech_dir)
        ljspeech_dir = os.path.join(args.ljspeech_dir, "LJSpeech-1.1")

    # Create normalizer
    normalizer_kind = args.normalizer_class
    if normalizer_kind == "ENCharParser":
        normalizer_call = parsers.make_parser(name='en')._normalize
    elif normalizer_kind == "Normalizer":
        whitelist_path = args.whitelist_path
        if whitelist_path is None:
            wget.download(
                "https://raw.githubusercontent.com/NVIDIA/NeMo/main/nemo_text_processing/text_normalization/en/data/whitelist_lj_speech.tsv",
                out=ljspeech_dir,
            )
            whitelist_path = os.path.join(ljspeech_dir, "whitelist_lj_speech.tsv")

        text_normalizer = Normalizer(
            lang="en",
            input_case="cased",
            whitelist=whitelist_path,
            overwrite_cache=True,
            cache_dir=os.path.join(ljspeech_dir, "cache_dir"),
        )
        normalize_options = {"punct_pre_process": True, "punct_post_process": True}

        def normalizer_call(raw_text):
            return text_normalizer.normalize(raw_text, **normalize_options)

    else:
        raise ValueError("normalizer_class must be ENCharParser or Normalizer")

    # Create manifests (based on predefined NVIDIA's split) and optionally save transcripts in .txt files
    filelist_base = 'https://raw.githubusercontent.com/NVIDIA/tacotron2/master/filelists'
    wav_dir = os.path.join(ljspeech_dir, 'wavs')
    for split in ('train', 'val', 'test'):
        # Download file list if necessary
        filelist_path = os.path.join(ljspeech_dir, f"ljs_audio_text_{split}_filelist.txt")
        if not os.path.exists(filelist_path):
            wget.download(f"{filelist_base}/ljs_audio_text_{split}_filelist.txt", out=ljspeech_dir)

        manifest_target = os.path.join(ljspeech_dir, f"ljspeech_{split}.json")
        with open(manifest_target, 'w') as f_out, open(filelist_path, 'r') as filelist:
            print(f"\nCreating {manifest_target}...")
            for row in filelist:
                # Fixed-width slices: characters 6-15 hold the utterance id,
                # the transcript starts at column 21.
                basename = row[6:16]
                text = row[21:].strip()
                norm_text = normalizer_call(text)

                # Make sure corresponding wavfile exists
                wav_path = os.path.join(wav_dir, basename + '.wav')
                assert os.path.exists(wav_path)

                if args.save_transcripts_in_txt:
                    txt_path = os.path.join(wav_dir, basename + '.txt')
                    with open(txt_path, 'w') as f_txt:
                        f_txt.write(norm_text)

                # Write manifest entry
                manifest_text = norm_text if args.manifest_text_var_is_normalized else text
                record = {
                    'audio_filepath': wav_path,
                    'duration': sox.file_info.duration(wav_path),
                    'text': manifest_text,
                    'normalized_text': norm_text,
                }
                f_out.write(json.dumps(record) + '\n')
def split_text(
    in_file: str,
    out_file: str,
    vocabulary: List[str],
    language="en",
    remove_brackets=True,
    do_lower_case=True,
    max_length=100,
    additional_split_symbols=None,
    use_nemo_normalization=False,
):
    """
    Breaks down the in_file roughly into sentences. Each sentence will be on a separate line.
    Written form of the numbers will be converted to its spoken equivalent, OOV punctuation will be removed.

    Three files are written (all UTF-8):
        * ``<out_file minus its last 4 characters>_with_punct.txt`` - split text, original punctuation and case
        * ``<out_file minus its last 4 characters>_with_punct_normalized.txt`` - same, with numbers spelled out
        * ``out_file`` - final text restricted to the vocabulary symbols

    Args:
        in_file: path to original transcript
        out_file: path to the output file
        vocabulary: ASR model vocabulary
        language: text language
        remove_brackets: Set to True if square [] and curly {} brackets should be removed from text.
            Text in square/curly brackets often contains inaudible fragments like notes or translations
        do_lower_case: flag that determines whether to apply lower case to the in_file text
        max_length: Max number of words of the text segment for alignment
        additional_split_symbols: Additional symbols (separated by "|") to use for sentence split if eos sentence
            split resulted in segments longer than --max_length. None or an empty string disables the extra split.
        use_nemo_normalization: Set to True to use NeMo normalization tool to convert numbers from written to spoken
            format. Normalization using num2words will be applied afterwards to make sure there are no numbers present
            in the text, otherwise they will be replaced with a space and that could deteriorate segmentation results.

    Raises:
        ValueError: if NeMo normalization is requested but not installed, or if it changes the number of sentences.
        NotImplementedError: re-raised from num2words when `language` is not supported by it.
    """
    print(f"Splitting text in {in_file} into sentences.")
    with open(in_file, "r", encoding="utf-8") as f:
        transcript = f.read()

    # remove some symbols for better split into sentences
    transcript = (
        transcript.replace("\n", " ")
        .replace("\t", " ")
        .replace("…", "...")
        .replace("\\", " ")
        .replace("--", " -- ")
        .replace(". . .", "...")
    )
    # remove extra space
    transcript = re.sub(r" +", " ", transcript)
    transcript = re.sub(r"(\.+)", ". ", transcript)
    if remove_brackets:
        transcript = re.sub(r'(\[.*?\])', ' ', transcript)
        # remove text in curly brackets
        transcript = re.sub(r'(\{.*?\})', ' ', transcript)

    lower_case_unicode = ''
    upper_case_unicode = ''
    if language == "ru":
        lower_case_unicode = '\u0430-\u04FF'
        upper_case_unicode = '\u0410-\u042F'
    elif language not in ["ru", "en"]:
        print(f"Consider using {language} unicode letters for better sentence split.")

    # remove space in the middle of the lower case abbreviation to avoid splitting into separate sentences
    matches = re.findall(rf'[a-z{lower_case_unicode}]\.\s[a-z{lower_case_unicode}]\.', transcript)
    for match in matches:
        transcript = transcript.replace(match, match.replace('. ', '.'))

    # find phrases in quotes
    with_quotes = re.finditer(r'“[A-Za-z ?]+.*?”', transcript)
    sentences = []
    last_idx = 0
    for m in with_quotes:
        match = m.group()
        match_idx = m.start()
        if last_idx < match_idx:
            sentences.append(transcript[last_idx:match_idx])
        sentences.append(match)
        last_idx = m.end()
    sentences.append(transcript[last_idx:])
    sentences = [s.strip() for s in sentences if s.strip()]

    # Read and split transcript by utterance (roughly, sentences)
    # NOTE(review): the last alternative `\?”\!”` looks like it is missing a `|` (`\?”|\!”`);
    # kept as is to preserve the current segmentation - confirm the intent.
    split_pattern = rf"(?<!\w\.\w.)(?<![A-Z{upper_case_unicode}][a-z{lower_case_unicode}]\.)(?<![A-Z{upper_case_unicode}]\.)(?<=\.|\?|\!|\.”|\?”\!”)\s"

    new_sentences = []
    for sent in sentences:
        new_sentences.extend(regex.split(split_pattern, sent))
    sentences = [s.strip() for s in new_sentences if s.strip()]

    def additional_split(sentences, split_on_symbols):
        # `additional_split_symbols` defaults to None, so None must be treated like
        # the empty string here (the former `len(...) == 0` check raised TypeError).
        if not split_on_symbols:
            return sentences

        split_on_symbols = split_on_symbols.split("|")
        min_len = 2

        def _split(sentences, delimiter):
            result = []
            for sent in sentences:
                split_sent = sent.split(delimiter)
                # keep the delimiter
                split_sent = [(s + delimiter).strip() for s in split_sent[:-1]] + [split_sent[-1]]

                if "," in delimiter:
                    # split based on comma usually results in too short utterance, combine sentences
                    # that result in a single word split. It's usually not recommended to do that for other delimiters.
                    comb = []
                    for s in split_sent:
                        # if the previous sentence is too short, combine it with the current sentence
                        if len(comb) > 0 and (len(comb[-1].split()) <= min_len or len(s.split()) <= min_len):
                            comb[-1] = comb[-1] + " " + s
                        else:
                            comb.append(s)
                    result.extend(comb)
                else:
                    result.extend(split_sent)
            return result

        another_sent_split = []
        for sent in sentences:
            split_sent = [sent]
            for delimiter in split_on_symbols:
                split_sent = _split(split_sent, delimiter + " ")
            another_sent_split.extend(split_sent)

        return [s.strip() for s in another_sent_split if s.strip()]

    sentences = additional_split(sentences, additional_split_symbols)

    vocabulary_symbols = []
    for token in vocabulary:
        if token != "<unk>":
            # for BPE models: drop the word-piece markers and keep the individual characters
            vocabulary_symbols.extend(token.replace("##", "").replace("▁", ""))
    vocabulary_symbols = list(set(vocabulary_symbols))
    vocabulary_symbols += [x.upper() for x in vocabulary_symbols]

    # check to make sure there will be no utterances for segmentation with only OOV symbols
    vocab_no_space_with_digits = set(vocabulary_symbols + [str(i) for i in range(10)])
    vocab_no_space_with_digits.discard(" ")

    sentences = [
        s.strip()
        for s in sentences
        if len(vocab_no_space_with_digits.intersection(set(s.lower()))) > 0 and s.strip()
    ]

    # when no punctuation marks present in the input text, split based on max_length
    if len(sentences) == 1:
        words = sentences[0].split()
        sentences = [" ".join(words[i:i + max_length]) for i in range(0, len(words), max_length)]
    sentences = [s.strip() for s in sentences if s.strip()]

    # save split text with original punctuation and case
    out_dir, out_file_name = os.path.split(out_file)
    with open(os.path.join(out_dir, out_file_name[:-4] + "_with_punct.txt"), "w", encoding="utf-8") as f:
        f.write(re.sub(r' +', ' ', "\n".join(sentences)))

    # substitute common abbreviations before applying lower case
    if language == "ru":
        for k, v in RU_ABBREVIATIONS.items():
            sentences = [s.replace(k, v) for s in sentences]
        # replace Latin characters with Russian
        for k, v in LATIN_TO_RU.items():
            sentences = [s.replace(k, v) for s in sentences]

    if language == "en" and use_nemo_normalization:
        if not NEMO_NORMALIZATION_AVAILABLE:
            raise ValueError("NeMo normalization tool is not installed.")

        print("Using NeMo normalization tool...")
        normalizer = Normalizer(input_case="cased", cache_dir=os.path.join(os.path.dirname(out_file), "en_grammars"))
        sentences_norm = normalizer.normalize_list(sentences, verbose=False, punct_post_process=True)
        if len(sentences_norm) != len(sentences):
            raise ValueError("Normalization failed, number of sentences does not match.")
        sentences = sentences_norm

    sentences = '\n'.join(sentences)

    # replace numbers with num2words: every run of digits is replaced by its spoken form
    try:
        sentences = re.sub(r"\d+", lambda m: num2words(m.group(), lang=language), sentences)
    except NotImplementedError:
        print(
            f"{language} might be missing in 'num2words' package. Add required language to the choices for the "
            f"--language argument."
        )
        raise

    sentences = re.sub(r' +', ' ', sentences)

    with open(os.path.join(out_dir, out_file_name[:-4] + "_with_punct_normalized.txt"), "w", encoding="utf-8") as f:
        f.write(sentences)

    if do_lower_case:
        sentences = sentences.lower()

    # every character that is not a vocabulary symbol, a newline or a space is replaced with a space
    symbols_to_remove = ''.join(set(sentences).difference(set(vocabulary_symbols + ["\n", " "])))
    sentences = sentences.translate(str.maketrans(symbols_to_remove, len(symbols_to_remove) * " "))

    # remove extra space
    sentences = re.sub(r' +', ' ', sentences)
    with open(out_file, "w", encoding="utf-8") as f:
        f.write(sentences)
class TestDate:
    """English date cases: inverse text normalization and text normalization."""

    # Every grammar object below is built once, while the class body executes,
    # and only when pynini is available; otherwise the attribute is None and the
    # corresponding tests are skipped through `skipif`.
    if PYNINI_AVAILABLE:
        inverse_normalizer_en = InverseNormalizer(lang='en', cache_dir=CACHE_DIR, overwrite_cache=False)
    else:
        inverse_normalizer_en = None

    # One parametrized case per entry returned by parse_test_case_file().
    @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_date.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        denormalized = self.inverse_normalizer_en.inverse_normalize(test_input, verbose=False)
        assert denormalized == expected

    if PYNINI_AVAILABLE:
        normalizer_en = Normalizer(
            input_case='cased',
            lang='en',
            cache_dir=CACHE_DIR,
            overwrite_cache=False,
            post_process=True,
        )
    else:
        normalizer_en = None

    # The audio-based normalizer additionally requires RUN_AUDIO_BASED_TESTS.
    if PYNINI_AVAILABLE and RUN_AUDIO_BASED_TESTS:
        normalizer_with_audio_en = NormalizerWithAudio(
            input_case='cased',
            lang='en',
            cache_dir=CACHE_DIR,
            overwrite_cache=False,
        )
    else:
        normalizer_with_audio_en = None

    @parameterized.expand(parse_test_case_file('en/data_text_normalization/test_cases_date.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm_uncased(self, test_input, expected):
        normalized = self.normalizer_en.normalize(test_input, verbose=False)
        assert normalized == expected

        # The non-deterministic check only runs when the audio-based normalizer was built.
        audio_normalizer = self.normalizer_with_audio_en
        if not audio_normalizer:
            return
        candidates = audio_normalizer.normalize(test_input, punct_post_process=False, n_tagged=100)
        assert expected in candidates, f"INPUT: {test_input}"

    if PYNINI_AVAILABLE:
        normalizer_uppercased = Normalizer(
            input_case='cased',
            lang='en',
            cache_dir=CACHE_DIR,
            overwrite_cache=False,
        )
    else:
        normalizer_uppercased = None

    # Inline cases (input -> expected) exercising differently capitalized month abbreviations.
    cases_uppercased = {
        "Aug. 8": "august eighth",
        "8 Aug.": "the eighth of august",
        "aug. 8": "august eighth",
    }

    @parameterized.expand(cases_uppercased.items())
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE,
        reason="`pynini` not installed, please install via nemo_text_processing/setup.sh",
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_norm_cased(self, test_input, expected):
        normalized = self.normalizer_uppercased.normalize(test_input, verbose=False)
        assert normalized == expected

        # The non-deterministic check only runs when the audio-based normalizer was built.
        audio_normalizer = self.normalizer_with_audio_en
        if not audio_normalizer:
            return
        candidates = audio_normalizer.normalize(test_input, punct_post_process=False, n_tagged=30)
        assert expected in candidates
""" with open(file_path, 'w') as fp: for line in data: fp.write(line + '\n') def parse_args(): parser = ArgumentParser() parser.add_argument("--input", help="input file path", required=True, type=str) parser.add_argument("--language", help="language", choices=['en'], default="en", type=str) parser.add_argument("--output", help="output file path", required=True, type=str) parser.add_argument( "--input_case", help="input capitalization", choices=["lower_cased", "cased"], default="cased", type=str ) parser.add_argument("--verbose", help="print meta info for debugging", action='store_true') return parser.parse_args() if __name__ == "__main__": args = parse_args() file_path = args.input normalizer = Normalizer(input_case=args.input_case, lang=args.language) print("Loading data: " + file_path) data = load_file(file_path) print("- Data: " + str(len(data)) + " sentences") normalizer_prediction = normalizer.normalize_list(data, verbose=args.verbose) write_file(args.output, normalizer_prediction) print(f"- Normalized. Writing out to {args.output}")