# NOTE: these snippets assume the surrounding repo provides the project-local
# modules used below (`audio`, `logmmse`, `get_pinyin`, `process_utterance`,
# `trim_silence`, `trim_long_silences`, `sampling_rate`, `hparams`); only the
# standard third-party imports are listed here.
from pathlib import Path
from typing import Optional, Union

import librosa
import numpy as np


def _preprocess_speaker_aishell2(speaker_dir, suffix, out_dir: Path,
                                 skip_existing: bool, hparams, others_params):
    speaker_dir, spk_id = speaker_dir
    trans_dict = others_params["trans_dict"]
    detach_label_and_embed_utt = others_params["detach_label_and_embed_utt"]
    metadata = []
    wav_fpath_list = speaker_dir.glob("*." + suffix)
    utt_fpath_list = list(speaker_dir.glob("*." + suffix))
    utt_num = len(utt_fpath_list)
    # Iterate over each wav
    for wav_fpath in wav_fpath_list:
        assert wav_fpath.exists(), str(wav_fpath) + " does not exist."
        # Process each utterance
        wav, _ = librosa.load(str(wav_fpath), sr=hparams.sample_rate)
        wav_abs_max = np.max(np.abs(wav))
        wav_abs_max = wav_abs_max if wav_abs_max > 0.0 else 1e-8
        wav = wav / wav_abs_max * hparams.rescaling_max  # normalize peak amplitude
        # wav_bak = wav

        # Denoise: profile the noise from the first and last 0.15 s of the clip
        if len(wav) > hparams.sample_rate * (0.3 + 0.1):
            noise_wav = np.concatenate([
                wav[:int(hparams.sample_rate * 0.15)],
                wav[-int(hparams.sample_rate * 0.15):]
            ])
            profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
            wav = logmmse.denoise(wav, profile, eta=0)

        # Trim silence
        wav = audio.trim_silence(wav, 30)  # top_db: smaller for noisy audio
        # audio.save_wav(wav_bak, str(wav_fpath.name), hparams.sample_rate)
        # audio.save_wav(wav, str(wav_fpath.name).replace('.wav', '_trimed.wav'),
        #                hparams.sample_rate)

        text = trans_dict[wav_fpath.stem]
        # Chinese to Pinyin
        pinyin = " ".join(get_pinyin(text, std=True, pb=True))
        # print(wav_fpath.name, wav_fpath.stem)

        random_uttBasename_forSpkEmbedding = None
        if detach_label_and_embed_utt:
            random_uttBasename_forSpkEmbedding = utt_fpath_list[
                np.random.randint(utt_num)].stem
        metadata.append(
            process_utterance(wav, pinyin, out_dir, wav_fpath.stem,
                              skip_existing, hparams,
                              random_uttBasename_forSpkEmbedding))
    return [m for m in metadata if m is not None]
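# A minimal, hedged sketch of the denoising step repeated throughout these
# helpers: profile the noise from the first/last 0.15 s of a clip, then run
# logmmse over the whole waveform. `logmmse` is assumed to be the repo's
# utils/logmmse module (profile_noise / denoise); the helper name and the
# synthetic example are illustrative only.
def _denoise_with_edge_profile(wav, sample_rate, edge_s=0.15, min_s=0.4):
    """Denoise `wav` using noise profiled from its leading/trailing edges."""
    if len(wav) <= sample_rate * min_s:
        return wav  # too short to profile reliably; leave untouched
    edge = int(sample_rate * edge_s)
    noise_wav = np.concatenate([wav[:edge], wav[-edge:]])
    profile = logmmse.profile_noise(noise_wav, sample_rate)
    return logmmse.denoise(wav, profile, eta=0)

# Example with a synthetic noisy tone (illustrative values):
# sr = 16000
# t = np.arange(sr * 2) / sr
# noisy = 0.5 * np.sin(2 * np.pi * 220 * t) + 0.01 * np.random.randn(len(t))
# clean = _denoise_with_edge_profile(noisy, sr)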
def _preprocess_speaker_SLR38(speaker_dir, suffix, out_dir: Path,
                              skip_existing: bool, hparams, others_params):
    detach_label_and_embed_utt = others_params["detach_label_and_embed_utt"]
    # Sort both listings so each wav is paired with its own transcript file
    wav_fpath_list = sorted(speaker_dir.glob("*." + suffix))
    text_fpath_list = sorted(speaker_dir.glob("*.txt"))
    metadata = []
    # Iterate over each wav
    utt_fpath_list = list(speaker_dir.glob("*." + suffix))
    utt_num = len(utt_fpath_list)
    for wav_fpath, txt_fpath in zip(wav_fpath_list, text_fpath_list):
        assert wav_fpath.exists(), str(wav_fpath) + " does not exist."
        assert txt_fpath.exists(), str(txt_fpath) + " does not exist."
        # Process each utterance
        wav, _ = librosa.load(str(wav_fpath), sr=hparams.sample_rate)
        wav = wav / np.max(np.abs(wav)) * hparams.rescaling_max
        # wav_bak = wav

        # Denoise: profile the noise from the first and last 0.15 s of the clip
        if len(wav) > hparams.sample_rate * (0.3 + 0.1):
            noise_wav = np.concatenate([
                wav[:int(hparams.sample_rate * 0.15)],
                wav[-int(hparams.sample_rate * 0.15):]
            ])
            profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
            wav = logmmse.denoise(wav, profile, eta=0)

        # Trim silence
        wav = audio.trim_silence(wav, 30)
        # audio.save_wav(wav_bak, str(wav_fpath.name), hparams.sample_rate)
        # audio.save_wav(wav, str(wav_fpath.name).replace('.wav', '_trimed.wav'),
        #                hparams.sample_rate)

        # Get text
        text = txt_fpath.read_text()
        # Chinese to Pinyin
        pinyin = " ".join(get_pinyin(text, std=True, pb=True))
        # print(wav_fpath.name, wav_fpath.stem)

        random_uttBasename_forSpkEmbedding = None
        if detach_label_and_embed_utt:
            random_uttBasename_forSpkEmbedding = utt_fpath_list[
                np.random.randint(utt_num)].stem
        metadata.append(
            process_utterance(wav, pinyin, out_dir, wav_fpath.stem,
                              skip_existing, hparams,
                              random_uttBasename_forSpkEmbedding))
    return [m for m in metadata if m is not None]
def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
                   source_sr: Optional[int] = None):
    """
    Applies the preprocessing operations used in training the Speaker Encoder to a waveform
    either on disk or in memory. The waveform will be resampled to match the data hyperparameters.

    :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
    just .wav), or the waveform itself as a numpy array of floats.
    :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
    preprocessing. After preprocessing, the waveform's sampling rate will match the data
    hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
    this argument will be ignored.
    """
    # Load the wav from disk if needed
    if isinstance(fpath_or_wav, (str, Path)):
        wav, source_sr = librosa.load(str(fpath_or_wav), sr=sampling_rate)
    else:
        wav = fpath_or_wav

    # Resample the wav if needed
    # if source_sr is not None and source_sr != sampling_rate:
    #     wav = librosa.resample(wav, source_sr, sampling_rate)

    # Normalize the peak amplitude
    wav_abs_max = np.max(np.abs(wav))
    wav_abs_max = wav_abs_max if wav_abs_max > 0.0 else 1e-8
    wav = wav / wav_abs_max * 0.9

    # # Apply the preprocessing: normalize volume and shorten long silences
    # wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
    # wav = trim_long_silences(wav)
    # save_wav(wav, fpath_or_wav.name, sampling_rate)  # TODO: rm DEBUG

    # Denoise: profile the noise from the first and last 0.15 s of the clip
    if len(wav) > sampling_rate * (0.3 + 0.1):
        noise_wav = np.concatenate([
            wav[:int(sampling_rate * 0.15)],
            wav[-int(sampling_rate * 0.15):]
        ])
        profile = logmmse.profile_noise(noise_wav, sampling_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    # Trim silence
    wav = trim_silence(wav, 30)  # top_db: smaller for noisy audio
    wav = trim_long_silences(wav)
    # save_wav(wav, fpath_or_wav.name.replace(".wav", "_trimed.wav"), sampling_rate)  # TODO: rm DEBUG
    return wav
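# Hedged usage sketch for preprocess_wav. `sampling_rate` is the encoder
# hyperparameter assumed by the function above (16 kHz in the upstream repo);
# since the resampling branch is commented out, an in-memory waveform is
# expected to already be at that rate. The commented file path is hypothetical.
def _demo_preprocess_wav():
    demo = np.random.randn(2 * 16000).astype(np.float32)  # 2 s of stand-in audio
    processed = preprocess_wav(demo, source_sr=16000)
    print(processed.shape)
    # From disk instead (hypothetical path):
    # processed = preprocess_wav("samples/speaker1_utt0.flac")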
def load_preprocess_wav(fpath):
    """
    Loads and preprocesses an audio file under the same conditions the audio files were used to
    train the synthesizer.
    """
    wav = librosa.load(fpath, sr=hparams.sample_rate)[0]
    wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Denoise: profile the noise from the first and last 0.15 s of the clip
    if len(wav) > hparams.sample_rate * (0.3 + 0.1):
        noise_wav = np.concatenate([
            wav[:int(hparams.sample_rate * 0.15)],
            wav[-int(hparams.sample_rate * 0.15):]
        ])
        profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    # Trim silence
    wav = audio.trim_silence(wav, 20)  # top_db: smaller for noisy audio
    return wav
def _preprocess_speaker_SLR68(speaker_dir, suffix, out_dir: Path,
                              skip_existing: bool, hparams, others_params):
    trans_dict = others_params["trans_dict"]
    metadata = []
    wav_fpath_list = speaker_dir.glob("*." + suffix)
    # Iterate over each wav
    for wav_fpath in wav_fpath_list:
        assert wav_fpath.exists(), str(wav_fpath) + " does not exist."
        # Process each utterance
        wav, _ = librosa.load(str(wav_fpath), sr=hparams.sample_rate)
        # wav_bak = wav
        wav = wav / np.max(np.abs(wav)) * hparams.rescaling_max  # normalize peak amplitude

        # Denoise: profile the noise from the first and last 0.15 s of the clip
        if len(wav) > hparams.sample_rate * (0.3 + 0.1):
            noise_wav = np.concatenate([
                wav[:int(hparams.sample_rate * 0.15)],
                wav[-int(hparams.sample_rate * 0.15):]
            ])
            profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
            wav = logmmse.denoise(wav, profile, eta=0)

        # Trim silence
        wav = audio.trim_silence(wav, 20)  # top_db: smaller for noisy audio
        # audio.save_wav(wav_bak, str(wav_fpath.name), hparams.sample_rate)
        # audio.save_wav(wav, str(wav_fpath.name).replace('.wav', '_trimed.wav'),
        #                hparams.sample_rate)

        text = trans_dict[wav_fpath.name]["text"]
        # Chinese to Pinyin
        pinyin = " ".join(get_pinyin(text, std=True, pb=True))
        # print(wav_fpath.name, wav_fpath.stem)
        metadata.append(
            process_utterance(wav, pinyin, out_dir, wav_fpath.stem,
                              skip_existing, hparams))
    return [m for m in metadata if m is not None]
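# The SLR68 (MagicData) helper above indexes `trans_dict` by wav file name and
# reads a "text" field. Below is a hedged sketch of how such a dict could be
# built from the corpus TRANS.txt, assuming a tab-separated layout of
# UtteranceID / SpeakerID / Transcription with a header row; the function name
# and the "speaker"/"text" keys are assumptions, not the repo's actual loader.
def build_slr68_trans_dict(trans_fpath: Path) -> dict:
    trans_dict = {}
    with trans_fpath.open(encoding="utf-8") as f:
        next(f)  # skip the assumed header row
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) < 3:
                continue
            utt_name, spk_id, text = parts[0], parts[1], parts[2]
            trans_dict[utt_name] = {"speaker": spk_id, "text": text}
    return trans_dict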
def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
                   source_sr: Optional[int] = None):
    # Load the audio (this variant always loads from disk, despite the type hint)
    wav, source_sr = librosa.load(str(fpath_or_wav), sr=sampling_rate)
    wav_abs_max = np.max(np.abs(wav))
    wav_abs_max = wav_abs_max if wav_abs_max > 0.0 else 1e-8
    wav = wav / wav_abs_max * 0.9

    # Denoise: profile the noise from the first and last 0.15 s of the clip
    if len(wav) > sampling_rate * (0.3 + 0.1):
        noise_wav = np.concatenate([
            wav[:int(sampling_rate * 0.15)],
            wav[-int(sampling_rate * 0.15):]
        ])
        profile = logmmse.profile_noise(noise_wav, sampling_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    # Trim leading/trailing silence
    wav = librosa.effects.trim(wav, top_db=30, frame_length=512, hop_length=128)[0]
    return wav
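# librosa.effects.trim returns both the trimmed signal and the [start, end]
# sample interval it kept; a hedged mini-example on a synthetic signal, with
# thresholds mirroring the call above (the helper name is illustrative).
def _demo_trim_interval():
    sig = np.concatenate([np.zeros(8000),
                          0.5 * np.ones(16000),
                          np.zeros(8000)]).astype(np.float32)
    trimmed, interval = librosa.effects.trim(sig, top_db=30,
                                             frame_length=512, hop_length=128)
    print(len(trimmed), interval)  # roughly 16000 samples kept around [8000, 24000]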
def split_on_silences(wav_fpath, words, end_times, hparams):
    # Load the audio waveform
    wav, _ = librosa.load(str(wav_fpath), sr=hparams.sample_rate)
    wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Denoise: profile the noise from the first and last 0.1 s of the clip
    if len(wav) > hparams.sample_rate * (0.3 + 0.1):
        noise_wav = np.concatenate([
            wav[:int(hparams.sample_rate * 0.1)],
            wav[-int(hparams.sample_rate * 0.1):]
        ])
        profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    words = np.array(words)
    start_times = np.array([0.0] + end_times[:-1])
    end_times = np.array(end_times)
    assert len(words) == len(end_times) == len(start_times)
    assert words[0] == "" and words[-1] == ""

    # Find pauses that are too long
    mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split)
    mask[0] = mask[-1] = True
    breaks = np.where(mask)[0]  # indices of the long pauses

    # Profile the noise from the silences and perform noise reduction on the waveform
    silence_times = [[start_times[i], end_times[i]] for i in breaks]
    silence_times = (np.array(silence_times) * hparams.sample_rate).astype(int)
    noisy_wav = np.concatenate(
        [wav[stime[0]:stime[1]] for stime in silence_times])
    if len(noisy_wav) > hparams.sample_rate * 0.02:
        profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    # Re-attach (re-join) segments that are too short
    segments = list(zip(breaks[:-1], breaks[1:]))
    segment_durations = [
        start_times[end] - end_times[start] for start, end in segments
    ]
    i = 0
    while i < len(segments) and len(segments) > 1:
        if segment_durations[i] < hparams.utterance_min_duration:
            # See if the segment can be re-attached with the right or the left segment
            left_duration = float("inf") if i == 0 else segment_durations[i - 1]
            right_duration = float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
            joined_duration = segment_durations[i] + min(left_duration, right_duration)

            # Do not re-attach if it causes the joined utterance to be too long
            if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
                i += 1
                continue

            # Re-attach the segment with the neighbour of shortest duration
            j = i - 1 if left_duration <= right_duration else i
            segments[j] = (segments[j][0], segments[j + 1][1])
            segment_durations[j] = joined_duration
            del segments[j + 1], segment_durations[j + 1]
        else:
            i += 1

    # Split the utterance
    segment_times = [[end_times[start], start_times[end]]
                     for start, end in segments]
    segment_times = (np.array(segment_times) * hparams.sample_rate).astype(int)
    wavs = [wav[segment_time[0]:segment_time[1]]
            for segment_time in segment_times]  # [N_seg, seg_time]
    texts = [" ".join(words[start + 1:end]).replace("  ", " ")
             for start, end in segments]  # [N_seg]

    # # DEBUG: play the audio segments (run with -n=1)
    # import sounddevice as sd
    # if len(wavs) > 1:
    #     print("This sentence was split in %d segments:" % len(wavs))
    # else:
    #     print("There are no silences long enough for this sentence to be split:")
    # for wav, text in zip(wavs, texts):
    #     # Pad the waveform with 1 second of silence because sounddevice tends to cut them early
    #     # when playing them. You shouldn't need to do that in your parsers.
    #     wav = np.concatenate((wav, [0] * 16000))
    #     print("\t%s" % text)
    #     sd.play(wav, 16000, blocking=True)
    # print("")

    return wavs, texts
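# A toy illustration of the pause-masking step above: with a fake alignment in
# which "" marks silence, the mask keeps only silences of at least
# `silence_min_duration_split` seconds (0.4 s assumed here), plus the two
# endpoints. The helper name and values are illustrative only.
def _demo_pause_mask():
    words = np.array(["", "hello", "", "world", ""])
    end_times = np.array([0.5, 1.0, 1.2, 2.0, 3.0])
    start_times = np.concatenate([[0.0], end_times[:-1]])
    mask = (words == "") & (end_times - start_times >= 0.4)
    mask[0] = mask[-1] = True
    print(np.where(mask)[0])  # -> [0 4]; the 0.2 s pause at index 2 is too short to split on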
def split_on_silences(wav_fpath, words, end_times, hparams):
    # Load the audio waveform
    wav, _ = librosa.load(wav_fpath, sr=hparams.sample_rate)
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # This variant returns the whole utterance as a single segment; the
    # splitting logic below is therefore unreachable and kept only for reference.
    text = ''.join(words)
    return [wav], [text]

    words = np.array(words)
    start_times = np.array([0.0] + end_times[:-1])
    end_times = np.array(end_times)
    assert len(words) == len(end_times) == len(start_times)
    assert words[0] == "" and words[-1] == ""

    # Find pauses that are too long
    mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split)
    mask[0] = mask[-1] = True
    breaks = np.where(mask)[0]

    # Profile the noise from the silences and perform noise reduction on the waveform
    silence_times = [[start_times[i], end_times[i]] for i in breaks]
    silence_times = (np.array(silence_times) * hparams.sample_rate).astype(int)
    noisy_wav = np.concatenate(
        [wav[stime[0]:stime[1]] for stime in silence_times])
    if len(noisy_wav) > hparams.sample_rate * 0.02:
        profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    # Re-attach segments that are too short
    segments = list(zip(breaks[:-1], breaks[1:]))
    segment_durations = [
        start_times[end] - end_times[start] for start, end in segments
    ]
    i = 0
    while i < len(segments) and len(segments) > 1:
        if segment_durations[i] < hparams.utterance_min_duration:
            # See if the segment can be re-attached with the right or the left segment
            left_duration = float("inf") if i == 0 else segment_durations[i - 1]
            right_duration = float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
            joined_duration = segment_durations[i] + min(left_duration, right_duration)

            # Do not re-attach if it causes the joined utterance to be too long
            if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
                i += 1
                continue

            # Re-attach the segment with the neighbour of shortest duration
            j = i - 1 if left_duration <= right_duration else i
            segments[j] = (segments[j][0], segments[j + 1][1])
            segment_durations[j] = joined_duration
            del segments[j + 1], segment_durations[j + 1]
        else:
            i += 1

    # Split the utterance
    segment_times = [[end_times[start], start_times[end]]
                     for start, end in segments]
    segment_times = (np.array(segment_times) * hparams.sample_rate).astype(int)
    wavs = [wav[segment_time[0]:segment_time[1]]
            for segment_time in segment_times]
    texts = [" ".join(words[start + 1:end]).replace("  ", " ")
             for start, end in segments]

    return wavs, texts
def split_on_silences(wav_fpath, words, end_times, hparams):
    """
    wav_fpath: a single audio file of the speaker
    words: all words of that file from the alignment file, with empty strings ("") marking silences
    end_times: timing info for that file from the alignment file
    hparams: audio processing params -> need to trace these back

    load audio file -> required
    find long pauses -> not required
    remove noise from them and re-attach them to the original wav -> not required
    split sentences on pauses and return arrays of all sentences with the wav for those sentences -> required
    """
    # Load the audio waveform
    wav, _ = librosa.load(wav_fpath, sr=hparams.sample_rate)
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    words = np.array(words)
    start_times = np.array([0.0] + end_times[:-1])
    end_times = np.array(end_times)
    print(f"words {words} start time {start_times} end time {end_times}")
    assert len(words) == len(end_times) == len(start_times)
    assert words[0] == "" and words[-1] == ""

    # Find pauses that are too long
    mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split)
    mask[0] = mask[-1] = True
    breaks = np.where(mask)[0]

    # Profile the noise from the silences and perform noise reduction on the waveform
    silence_times = [[start_times[i], end_times[i]] for i in breaks]
    silence_times = (np.array(silence_times) * hparams.sample_rate).astype(int)
    noisy_wav = np.concatenate(
        [wav[stime[0]:stime[1]] for stime in silence_times])
    if len(noisy_wav) > hparams.sample_rate * 0.02:
        profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    # Re-attach segments that are too short
    segments = list(zip(breaks[:-1], breaks[1:]))
    segment_durations = [
        start_times[end] - end_times[start] for start, end in segments
    ]
    i = 0
    while i < len(segments) and len(segments) > 1:
        if segment_durations[i] < hparams.utterance_min_duration:
            # See if the segment can be re-attached with the right or the left segment
            left_duration = float("inf") if i == 0 else segment_durations[i - 1]
            right_duration = float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
            joined_duration = segment_durations[i] + min(left_duration, right_duration)

            # Do not re-attach if it causes the joined utterance to be too long
            if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
                i += 1
                continue

            # Re-attach the segment with the neighbour of shortest duration
            j = i - 1 if left_duration <= right_duration else i
            segments[j] = (segments[j][0], segments[j + 1][1])
            segment_durations[j] = joined_duration
            del segments[j + 1], segment_durations[j + 1]
        else:
            i += 1

    # Split the utterance
    segment_times = [[end_times[start], start_times[end]]
                     for start, end in segments]
    segment_times = (np.array(segment_times) * hparams.sample_rate).astype(int)
    wavs = [wav[segment_time[0]:segment_time[1]]
            for segment_time in segment_times]
    texts = [" ".join(words[start + 1:end]).replace("  ", " ")
             for start, end in segments]
    print(f"length of all wavs {len(wavs)} all texts {texts}")

    # # DEBUG: play the audio segments (run with -n=1)
    # import sounddevice as sd
    # if len(wavs) > 1:
    #     print("This sentence was split in %d segments:" % len(wavs))
    # else:
    #     print("There are no silences long enough for this sentence to be split:")
    # for wav, text in zip(wavs, texts):
    #     # Pad the waveform with 1 second of silence because sounddevice tends to cut them early
    #     # when playing them. You shouldn't need to do that in your parsers.
    #     wav = np.concatenate((wav, [0] * 16000))
    #     print("\t%s" % text)
    #     sd.play(wav, 16000, blocking=True)
    # print("")

    return wavs, texts
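# Hedged usage sketch for split_on_silences: writes a few seconds of synthetic
# audio to a temporary wav so the call is self-contained. The word/time values
# mimic the alignment rows this code consumes and are made up; `hparams` is the
# synthesizer hyperparameter object assumed by the functions above, and
# `soundfile` is only used here to create the demo file.
import tempfile
import soundfile as sf

def _demo_split_on_silences(hparams):
    sr = hparams.sample_rate
    demo_wav = 0.1 * np.random.randn(int(3.1 * sr)).astype(np.float32)
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp.close()
    sf.write(tmp.name, demo_wav, sr)
    words = ",the,cat,,sat,".split(",")        # "" marks silences
    end_times = [0.35, 0.80, 1.20, 1.90, 2.40, 3.10]
    wavs, texts = split_on_silences(tmp.name, words, end_times, hparams)
    for seg_wav, seg_text in zip(wavs, texts):
        print(round(len(seg_wav) / sr, 2), repr(seg_text))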