Example #1
0
def split_on_silences(wav_fpath, words, end_times, hparams):
    """Load an audio file and return it as a single (wav, text) utterance.

    Splitting on silences is disabled in this variant: the function returns
    the whole waveform together with the concatenated transcript. (The
    original segmentation logic sat after an unconditional `return` and was
    unreachable dead code; it has been removed.)

    Args:
        wav_fpath: path to the audio file to load.
        words: sequence of words from the alignment file; "" entries mark
            silences.
        end_times: per-word end timestamps — unused while splitting is
            disabled; kept for interface compatibility with callers.
        hparams: audio parameters providing at least `sample_rate`,
            `rescale` and `rescaling_max`.

    Returns:
        ([wav], [text]): one-element lists holding the full waveform and
        the transcript built by concatenating `words`.
    """
    # Load the audio waveform. `sr` must be passed by keyword: librosa >= 0.10
    # made every argument after the path keyword-only.
    wav, _ = librosa.load(wav_fpath, sr=hparams.sample_rate)
    if hparams.rescale:
        # Peak-normalize, then scale to the configured maximum amplitude.
        # NOTE(review): divides by max(|wav|); an all-zero (silent) file
        # would produce NaN/inf here — confirm inputs are non-silent.
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Join without separators, matching the original behavior exactly:
    # silences are "" entries, so adjacent real words end up fused together —
    # presumably acceptable for this pipeline; verify against the caller.
    text = ''.join(words)
    return [wav], [text]
Example #2
0
def split_on_silences(wav_fpath, words, end_times, hparams):
    """Split one force-aligned utterance on long silences.

    Pipeline: load the audio, optionally peak-rescale it, locate silences
    longer than `hparams.silence_min_duration_split`, denoise the waveform
    using those silences as a noise profile, merge segments that are too
    short back with a neighbour, and finally cut the waveform and transcript
    at the remaining break points.

    Args:
        wav_fpath: path to a single audio file of one speaker.
        words: all words of that file from the alignment file, with an empty
            string ("") on each silence.
        end_times: per-word end timestamps (seconds) from the alignment file.
        hparams: audio-processing parameters (`sample_rate`, `rescale`,
            `rescaling_max`, `silence_min_duration_split`,
            `utterance_min_duration`, `hop_size`, `max_mel_frames`).

    Returns:
        (wavs, texts): parallel lists of waveform segments and their
        transcripts, one entry per resulting sub-utterance.
    """
    # Load the audio waveform. `sr` is keyword-only since librosa 0.10.
    wav, _ = librosa.load(wav_fpath, sr=hparams.sample_rate)
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    words = np.array(words)
    # Each word starts where the previous one ended; the first starts at 0.
    start_times = np.array([0.0] + end_times[:-1])
    end_times = np.array(end_times)
    assert len(words) == len(end_times) == len(start_times)
    assert words[0] == "" and words[-1] == ""

    # Find pauses that are too long
    mask = (words == "") & (end_times - start_times >=
                            hparams.silence_min_duration_split)
    # Always treat the leading and trailing silences as break points.
    mask[0] = mask[-1] = True
    breaks = np.where(mask)[0]

    # Profile the noise from the silences and perform noise reduction on the
    # waveform. NOTE: `np.int` was removed in NumPy 1.24 — use builtin `int`.
    silence_times = [[start_times[i], end_times[i]] for i in breaks]
    silence_times = (np.array(silence_times) * hparams.sample_rate).astype(int)
    noisy_wav = np.concatenate(
        [wav[stime[0]:stime[1]] for stime in silence_times])
    # Only denoise when there is at least 20 ms of silence to profile.
    if len(noisy_wav) > hparams.sample_rate * 0.02:
        profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    # Re-attach segments that are too short
    segments = list(zip(breaks[:-1], breaks[1:]))
    segment_durations = [
        start_times[end] - end_times[start] for start, end in segments
    ]
    i = 0
    while i < len(segments) and len(segments) > 1:
        if segment_durations[i] < hparams.utterance_min_duration:
            # See if the segment can be re-attached with the right or the
            # left segment (inf means "no neighbour on that side").
            left_duration = (float("inf") if i == 0
                             else segment_durations[i - 1])
            right_duration = (float("inf") if i == len(segments) - 1
                              else segment_durations[i + 1])
            joined_duration = segment_durations[i] + min(left_duration,
                                                         right_duration)

            # Do not re-attach if it causes the joined utterance to be too long
            if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
                i += 1
                continue

            # Re-attach the segment with the neighbour of shortest duration
            j = i - 1 if left_duration <= right_duration else i
            segments[j] = (segments[j][0], segments[j + 1][1])
            segment_durations[j] = joined_duration
            del segments[j + 1], segment_durations[j + 1]
        else:
            i += 1

    # Split the utterance: each segment runs from the end of its opening
    # silence to the start of its closing silence.
    segment_times = [[end_times[start], start_times[end]]
                     for start, end in segments]
    segment_times = (np.array(segment_times) * hparams.sample_rate).astype(int)
    wavs = [
        wav[segment_time[0]:segment_time[1]] for segment_time in segment_times
    ]
    texts = [
        " ".join(words[start + 1:end]).replace("  ", " ")
        for start, end in segments
    ]
    return wavs, texts
Example #3
0
def split_on_silences(wav_fpath, words, end_times, hparams):
    """Split one force-aligned utterance on long silences.

    Loads the audio, optionally peak-rescales it, finds silences longer than
    `hparams.silence_min_duration_split`, denoises the waveform using those
    silences as a noise profile, merges too-short segments with a neighbour,
    then cuts the waveform and transcript at the remaining break points.
    (All leftover debug `print` calls and commented-out playback code have
    been removed.)

    Args:
        wav_fpath: path to a single audio file of one speaker.
        words: all words of that file from the alignment file, with an empty
            string ("") on each silence.
        end_times: per-word end timestamps (seconds) from the alignment file.
        hparams: audio-processing parameters (`sample_rate`, `rescale`,
            `rescaling_max`, `silence_min_duration_split`,
            `utterance_min_duration`, `hop_size`, `max_mel_frames`).

    Returns:
        (wavs, texts): parallel lists of waveform segments and their
        transcripts, one entry per resulting sub-utterance.
    """
    # Load the audio waveform. `sr` is keyword-only since librosa 0.10.
    wav, _ = librosa.load(wav_fpath, sr=hparams.sample_rate)
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    words = np.array(words)
    # Each word starts where the previous one ended; the first starts at 0.
    start_times = np.array([0.0] + end_times[:-1])
    end_times = np.array(end_times)
    assert len(words) == len(end_times) == len(start_times)
    assert words[0] == "" and words[-1] == ""

    # Find pauses that are too long
    mask = (words == "") & (end_times - start_times >=
                            hparams.silence_min_duration_split)
    # Always treat the leading and trailing silences as break points.
    mask[0] = mask[-1] = True
    breaks = np.where(mask)[0]

    # Profile the noise from the silences and perform noise reduction on the
    # waveform. NOTE: `np.int` was removed in NumPy 1.24 — use builtin `int`.
    silence_times = [[start_times[i], end_times[i]] for i in breaks]
    silence_times = (np.array(silence_times) * hparams.sample_rate).astype(int)
    noisy_wav = np.concatenate(
        [wav[stime[0]:stime[1]] for stime in silence_times])
    # Only denoise when there is at least 20 ms of silence to profile.
    if len(noisy_wav) > hparams.sample_rate * 0.02:
        profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    # Re-attach segments that are too short
    segments = list(zip(breaks[:-1], breaks[1:]))
    segment_durations = [
        start_times[end] - end_times[start] for start, end in segments
    ]
    i = 0
    while i < len(segments) and len(segments) > 1:
        if segment_durations[i] < hparams.utterance_min_duration:
            # See if the segment can be re-attached with the right or the
            # left segment (inf means "no neighbour on that side").
            left_duration = (float("inf") if i == 0
                             else segment_durations[i - 1])
            right_duration = (float("inf") if i == len(segments) - 1
                              else segment_durations[i + 1])
            joined_duration = segment_durations[i] + min(left_duration,
                                                         right_duration)

            # Do not re-attach if it causes the joined utterance to be too long
            if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
                i += 1
                continue

            # Re-attach the segment with the neighbour of shortest duration
            j = i - 1 if left_duration <= right_duration else i
            segments[j] = (segments[j][0], segments[j + 1][1])
            segment_durations[j] = joined_duration
            del segments[j + 1], segment_durations[j + 1]
        else:
            i += 1

    # Split the utterance: each segment runs from the end of its opening
    # silence to the start of its closing silence.
    segment_times = [[end_times[start], start_times[end]]
                     for start, end in segments]
    segment_times = (np.array(segment_times) * hparams.sample_rate).astype(int)
    wavs = [
        wav[segment_time[0]:segment_time[1]] for segment_time in segment_times
    ]
    texts = [
        " ".join(words[start + 1:end]).replace("  ", " ")
        for start, end in segments
    ]
    return wavs, texts