import librosa
import numpy as np

# Noise-reduction module used below; the import path assumes the repo's
# utils package exposes logmmse (adjust if your layout differs).
from utils import logmmse


def split_on_silences(wav_fpath, words, end_times, hparams):
    """
    Split one audio file into utterances on its long silences.

    wav_fpath: a single audio file of one speaker
    words: all words of that file from the alignment file, with an empty
        string ("") marking each silence
    end_times: per-word end timings for that file from the alignment file
    hparams: audio processing params -> still needs to be traced back

    Steps:
        load the audio file -> required
        find long pauses -> not required
        remove noise from them and re-attach them to the original wav -> not required
        split the sentence on pauses and return arrays of all sentences
        with the wav for those sentences -> required
    """
    # Load the audio waveform
    wav, _ = librosa.load(wav_fpath, sr=hparams.sample_rate)
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    words = np.array(words)
    start_times = np.array([0.0] + end_times[:-1])
    end_times = np.array(end_times)
    assert len(words) == len(end_times) == len(start_times)
    assert words[0] == "" and words[-1] == ""

    # Find pauses that are too long
    mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split)
    mask[0] = mask[-1] = True
    breaks = np.where(mask)[0]

    # Profile the noise from the silences and perform noise reduction on the waveform
    silence_times = [[start_times[i], end_times[i]] for i in breaks]
    silence_times = (np.array(silence_times) * hparams.sample_rate).astype(int)
    noisy_wav = np.concatenate([wav[stime[0]:stime[1]] for stime in silence_times])
    if len(noisy_wav) > hparams.sample_rate * 0.02:
        profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    # Re-attach segments that are too short
    segments = list(zip(breaks[:-1], breaks[1:]))
    segment_durations = [start_times[end] - end_times[start] for start, end in segments]
    i = 0
    while i < len(segments) and len(segments) > 1:
        if segment_durations[i] < hparams.utterance_min_duration:
            # See if the segment can be re-attached with the right or the left segment
            left_duration = float("inf") if i == 0 else segment_durations[i - 1]
            right_duration = float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
            joined_duration = segment_durations[i] + min(left_duration, right_duration)

            # Do not re-attach if it causes the joined utterance to be too long
            if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
                i += 1
                continue

            # Re-attach the segment with the neighbour of shortest duration
            j = i - 1 if left_duration <= right_duration else i
            segments[j] = (segments[j][0], segments[j + 1][1])
            segment_durations[j] = joined_duration
            del segments[j + 1], segment_durations[j + 1]
        else:
            i += 1

    # Split the utterance
    segment_times = [[end_times[start], start_times[end]] for start, end in segments]
    segment_times = (np.array(segment_times) * hparams.sample_rate).astype(int)
    wavs = [wav[segment_time[0]:segment_time[1]] for segment_time in segment_times]
    texts = [" ".join(words[start + 1:end]).replace("  ", " ") for start, end in segments]

    # # DEBUG: play the audio segments (run with -n=1)
    # import sounddevice as sd
    # if len(wavs) > 1:
    #     print("This sentence was split in %d segments:" % len(wavs))
    # else:
    #     print("There are no silences long enough for this sentence to be split:")
    # for wav, text in zip(wavs, texts):
    #     # Pad the waveform with 1 second of silence because sounddevice tends to cut
    #     # them early when playing them. You shouldn't need to do that in your parsers.
    #     wav = np.concatenate((wav, [0] * 16000))
    #     print("\t%s" % text)
    #     sd.play(wav, 16000, blocking=True)
    #     print("")

    return wavs, texts
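

# Hypothetical usage sketch (not part of the repo): the file path, alignment
# values, and SimpleNamespace hparams below are made-up stand-ins for the
# project's real preprocessing inputs, shown only to illustrate the call.
if __name__ == "__main__":
    from types import SimpleNamespace

    hparams = SimpleNamespace(
        sample_rate=16000,
        rescale=True,
        rescaling_max=0.9,
        silence_min_duration_split=0.4,  # split on silences of at least 0.4 s
        utterance_min_duration=1.6,      # re-attach segments shorter than 1.6 s
        hop_size=200,
        max_mel_frames=900,              # caps a joined utterance at 200 * 900 / 16000 = 11.25 s
    )

    # Alignment for three words and a mid-sentence pause; the leading and
    # trailing "" entries are the silences the asserts above expect.
    words = ["", "hello", "world", "", "again", ""]
    end_times = [0.5, 1.0, 1.8, 2.5, 3.2, 3.8]

    wavs, texts = split_on_silences("speaker1/utterance.flac", words, end_times, hparams)
    for segment_wav, text in zip(wavs, texts):
        print("%.2f s: %s" % (len(segment_wav) / hparams.sample_rate, text))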