Example #1
def _process_utterance(out_dir, in_dir, source_wav_name, target_wav_name,
                       emotion_id):
    '''Preprocesses a single source/target utterance pair.

    This writes the target linear-scale spectrogram and the source/target mel-scale
    spectrograms to disk and returns a tuple to write to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      in_dir: The directory containing the input wav files
      source_wav_name: Filename of the source utterance wav
      target_wav_name: Filename of the target utterance wav
      emotion_id: Numeric id of the target emotion

    Returns:
      An (emotion_id, target_spectrogram_filename, source_mel_filename,
      target_mel_filename, source_n_frames, target_n_frames) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    source_wav = audio.load_wav(os.path.join(in_dir, source_wav_name))
    target_wav = audio.load_wav(os.path.join(in_dir, target_wav_name))

    if hparams.rescaling:
        source_wav = source_wav / np.abs(
            source_wav).max() * hparams.rescaling_max
        target_wav = target_wav / np.abs(
            target_wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    #s_spectrogram = audio.spectrogram(source_wav).astype(np.float32)
    t_spectrogram = audio.spectrogram(target_wav).astype(np.float32)

    # Compute a mel-scale spectrogram from the wav:
    smel_spectrogram = audio.melspectrogram(source_wav).astype(np.float32)
    tmel_spectrogram = audio.melspectrogram(target_wav).astype(np.float32)
    s_n_frames = smel_spectrogram.shape[1]
    t_n_frames = tmel_spectrogram.shape[1]

    # Write the spectrograms to disk:
    #s_spectrogram_filename = 'source-spec-{}.npy'.format(source_wav_name)
    t_spectrogram_filename = 'target-spec-{}.npy'.format(
        target_wav_name.replace('.wav', ''))
    smel_filename = 'source-mel-{}.npy'.format(
        source_wav_name.replace('.wav', ''))
    tmel_filename = 'target-mel-{}.npy'.format(
        target_wav_name.replace('.wav', ''))
    #np.save(os.path.join(out_dir, s_spectrogram_filename), s_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, t_spectrogram_filename),
            t_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, smel_filename),
            smel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, tmel_filename),
            tmel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (emotion_id, t_spectrogram_filename, smel_filename, tmel_filename,
            s_n_frames, t_n_frames)
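
The tuples returned by preprocessing functions like this one are typically collected by a driver and flushed to train.txt. A minimal sketch of such a driver, with a hypothetical write_metadata helper and a pipe-separated line format (both are assumptions, not code from this repository):

import os

def write_metadata(metadata, out_dir, filename="train.txt"):
    # Each entry is a tuple such as the one _process_utterance returns;
    # fields are joined with '|' into one line per utterance.
    with open(os.path.join(out_dir, filename), "w", encoding="utf-8") as f:
        for entry in metadata:
            f.write("|".join(str(field) for field in entry) + "\n")
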
Example #2
def gen_data(audio_path, full_frames):
    wav = audio.load_wav(audio_path, 16000)
    mel = audio.melspectrogram(wav)
    print(mel.shape)

    if np.isnan(mel.reshape(-1)).sum() > 0:
        raise ValueError(
            'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again'
        )

    mel_chunks = []
    mel_idx_multiplier = 80. / fps
    i = 0
    while 1:
        start_idx = int(i * mel_idx_multiplier)
        if start_idx + mel_step_size > len(mel[0]):
            mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:])
            break
        mel_chunks.append(mel[:, start_idx:start_idx + mel_step_size])
        i += 1

    print("Length of mel chunks: {}".format(len(mel_chunks)))

    full_frames = full_frames[:len(mel_chunks)]

    gen = datagen(full_frames.copy(), mel_chunks)
    return gen
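
The constant 80. in mel_idx_multiplier is the number of mel frames per second, which follows from the 16 kHz sample rate and an assumed hop length of 200 samples; fps and mel_step_size are module-level values not shown here. A small sketch making that relationship explicit (all values are assumptions):

sr = 16000          # sample rate passed to audio.load_wav above
hop_size = 200      # assumed hop length of audio.melspectrogram
fps = 25            # assumed video frame rate

mel_frames_per_second = sr / hop_size        # = 80, hence the 80. / fps above
mel_idx_multiplier = mel_frames_per_second / fps
print(mel_idx_multiplier)                    # mel frames advanced per video frame (3.2 here)
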
Example #3
def _process_utterance(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Write the spectrograms to disk:
    audio_filename = 'ljspeech-audio-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    # np.save(os.path.join(out_dir, audio_filename),
    #         out.astype(out_dtype), allow_pickle=False)
    # np.save(os.path.join(out_dir, mel_filename),
    #         mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text)
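
The padding and trimming above maintain the invariant len(out) == N * hop_size, so the conditioning mel frames can later be upsampled to sample rate with a transposed convolution. A toy numpy check of that invariant, using placeholder values instead of the repository's audio helpers:

import numpy as np

hop_size = 256            # assumed hop length
n_frames = 101            # N: number of mel frames
pad_l, pad_r = 512, 512   # stand-ins for audio.lws_pad_lr(...)

out = np.zeros(25_600)    # stand-in for the (possibly trimmed) sample stream
out = np.pad(out, (pad_l, pad_r), mode="constant", constant_values=0.0)

assert len(out) >= n_frames * hop_size
out = out[:n_frames * hop_size]      # trim so the samples cover a whole number of frames
assert len(out) % hop_size == 0      # each mel frame now corresponds to exactly hop_size samples
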
Example #4
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'nikl-single-spec-%05d.npy' % index
    mel_filename = 'nikl-single-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
Example #5
 def __getitem__(self, index):
     # Read audio
     filename = self.audio_files[index]
     wav = deepaudio.load_wav(filename)
     # load in raw_audio via utils
     raw_audio, _ = utils.load_wav_to_torch(filename)
     # convert the numpy wav to a torch tensor
     audio = torch.from_numpy(wav)
     # take segment
     if audio.size(0) >= self.segment_length:
         max_audio_start = audio.size(0) - self.segment_length
         audio_start = random.randint(0, max_audio_start)
         audio = audio[audio_start:audio_start + self.segment_length]
         # update raw audio as well
         raw_audio = raw_audio[audio_start:audio_start +
                               self.segment_length]
     else:
         audio = torch.nn.functional.pad(
             audio, (0, self.segment_length - audio.size(0)),
             'constant').data
         # pad raw audio as well
         raw_audio = torch.nn.functional.pad(
             raw_audio, (0, self.segment_length - raw_audio.size(0)),
             'constant').data
     # compute mel
     mel = deepaudio.melspectrogram(audio.numpy())
     # convert mel to torch
     mel = torch.from_numpy(mel)
     audio = utils.mu_law_encode(raw_audio / utils.MAX_WAV_VALUE,
                                 self.mu_quantization)
     return (mel, audio)
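
utils.mu_law_encode is not shown in this snippet; a common definition (mu-law companding followed by uniform quantization, as in WaveNet-style preprocessing) looks roughly like the sketch below. This is an assumption about that helper, not its actual source:

import math
import torch

def mu_law_encode(audio, mu_quantization=256):
    # audio is expected in [-1, 1]; returns integer classes in [0, mu_quantization - 1].
    mu = mu_quantization - 1
    audio = torch.clamp(audio, -1.0, 1.0)
    magnitude = torch.log1p(mu * torch.abs(audio)) / math.log1p(mu)
    signal = torch.sign(audio) * magnitude           # companded signal, still in [-1, 1]
    return ((signal + 1.0) / 2.0 * mu + 0.5).long()
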
Example #6
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'vctk-spec-%05d.npy' % index
    mel_filename = 'vctk-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return spectrogram_filename, mel_filename, n_frames, text, speaker_id
Example #7
def _process_utterance(out_dir, wav_path):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    return mel_spectrogram.astype(np.float32)
Example #8
def process(info_dict):
    wav_path = os.path.join(hp.data_path, "Wave")
    wav_file_name = os.path.join(wav_path, info_dict["sentence_id"]+".wav")
    wav = audio.load_wav(wav_file_name)
    mel = audio.melspectrogram(wav).T
    mel_file_path = os.path.join(hp.mel_path, info_dict["sentence_id"]+".npy")
    np.save(mel_file_path, mel)

    phone_idx = info_dict["sentence_id"] + "|"
    for phone_duration in info_dict["alignment"]:
        phone_idx += str(phone_map[phone_duration[0]]) + " "

    duration_idx = info_dict["sentence_id"] + "|"
    length_mel = mel.shape[0]
    length_phone_list = len(info_dict["alignment"])
    cur_pointer = 0
    for frame_id in range(length_mel):
        added = False
        cur_time = hp.frame_length_ms / 2 + frame_id * hp.frame_shift_ms
        cur_time = cur_time / 1000.0
        for i in range(cur_pointer, length_phone_list):
            if cur_time >= info_dict["alignment"][i][1][0] and cur_time < info_dict["alignment"][i][1][1]:
                phone_id = phone_map[info_dict["alignment"][i][0]]
                duration_idx += str(phone_id) + " "
                cur_pointer = i
                added = True
                break
        if not added:
            phone_id = phone_map[info_dict["alignment"][cur_pointer][0]]
            duration_idx += str(phone_id) + " "

    return phone_idx[:-1], duration_idx[:-1]
Example #9
def extract_MFCC_and_text(wav_file_path, mfcc_dir):

    wav_filenames = glob.glob(wav_file_path)

    for wav_fname in wav_filenames:
        text_filename = wav_fname.replace(".WAV.wav", ".TXT")
        fullname = wav_fname.split('/')[-1]
        fname = fullname.split('.')[0]

        # Process the text: remove the first two numbers from the text file
        with open(text_filename, 'r') as file:
            sentence = file.read()
        sentence = sentence.split()[2:] + ['\n']
        sentence = ' '.join(sentence).lower()
        # Write the processed text to the mfcc directory
        text_fname = mfcc_dir + '/' + fname + '.txt'
        with open(text_fname, "w") as file:
            file.write(sentence)

        # Generate the MFCC features
        wav = audio.load_wav(wav_fname)
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
        mspec_fname = mfcc_dir + '/' + fname
        np.save(mspec_fname, mel_spectrogram,
                allow_pickle=False)  #generates features of shape: L x 80

    return
Example #10
def _process_utterance(out_dir, in_dir, label, speaker_name, hparams):
    wav_paths = glob.glob(os.path.join(in_dir, "*.wav"))
    if not wav_paths:
        return None

    num_samples = len(wav_paths)
    npz_dir = os.path.join(out_dir, speaker_name)
    os.makedirs(npz_dir, exist_ok=True)

    for idx, wav_path in enumerate(wav_paths):
        wav_name, ext = os.path.splitext(os.path.basename(wav_path))
        if ext == ".wav":
            wav, sr = librosa.load(wav_path, sr=hparams.sample_rate)

            # rescale wav
            if hparams.rescaling:  # hparams.rescale = True
                wav = wav / np.abs(wav).max() * hparams.rescaling_max

            # M-AILABS extra silence specific
            if hparams.trim_silence:  # hparams.trim_silence = True
                wav = trim_silence(wav, hparams)  # Trim leading and trailing silence

            mel = melspectrogram(wav, hparams)
            seq_len = wav.shape[0]
            frame_len = mel.shape[1]

            file_name = wav_name
            np.savez(os.path.join(out_dir, file_name), mel=mel.T, speaker=label, seq_len=seq_len, frame_len=frame_len)


    return num_samples
Example #11
def gen_samples(out_dir, wav_path, n_samples):
    wav = audio.load_wav(wav_path)
    hop_size = hparams.hop_length
    seg_len = hparams.seg_len
    spec_len = hparams.spec_len
    # not sure why we have to minus 1 here ?
    wav_len = wav.shape[0] // hop_size * hop_size - 1
    wav = wav[:wav_len]
    spec = audio.spectrogram(wav)
    mel = audio.melspectrogram(wav)
    max_val = spec.shape[1] - 1 - spec_len
    if max_val < 0:
        return []
    idx = np.random.randint(0, max_val, size=(n_samples))
    d = []
    i = 0
    for offset in idx:
        i += 1
        w = wav[offset * hop_size:offset * hop_size + seg_len]
        s = spec[:, offset:offset + spec_len]
        m = mel[:, offset:offset + spec_len]
        wav_name = wav_path.split('/')[-1].split('.')[0]
        file_path = "{0}/{1}_{2:03d}.npz".format(out_dir, wav_name, i)
        np.savez(file_path, wav=w, spec=s, mel=m)
        d.append(file_path)
    return d
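
For the wav slice and the spectrogram slices above to cover the same span of audio, hparams.seg_len has to equal hparams.spec_len * hop_size, since spectrogram frame i starts at sample i * hop_size. A tiny sanity check under that assumption (all values are illustrative):

hop_size = 256                   # assumed hparams.hop_length
spec_len = 128                   # assumed hparams.spec_len (frames)
seg_len = spec_len * hop_size    # assumed hparams.seg_len (samples)

offset = 10                      # one of the random frame offsets drawn in gen_samples
wav_slice = (offset * hop_size, offset * hop_size + seg_len)
spec_slice = (offset, offset + spec_len)

# Both slices span the same region: spec_len frames of hop_size samples each.
assert wav_slice[1] - wav_slice[0] == (spec_slice[1] - spec_slice[0]) * hop_size
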
Example #12
def test():
    wavs_path = os.path.join("data", "LJSpeech-1.1")
    wavs_path = os.path.join(wavs_path, "wavs")
    wav_path = os.path.join(wavs_path, "LJ001-0001.wav")
    wav = audio.load_wav(wav_path)
    mel_spec = audio.melspectrogram(wav)
    wav_after_inv = audio.inv_mel_spectrogram(mel_spec)
    audio.save_wav(wav_after_inv, "test.wav")
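
audio.inv_mel_spectrogram is specific to this repository (typically Griffin-Lim on the de-normalized mel). A rough, self-contained equivalent using librosa's mel inversion, with assumed STFT parameters rather than the repository's hparams:

import librosa
import soundfile as sf

wav, sr = librosa.load("data/LJSpeech-1.1/wavs/LJ001-0001.wav", sr=22050)

# Mel analysis with assumed parameters (n_fft, hop_length and n_mels are guesses, not hparams).
mel = librosa.feature.melspectrogram(y=wav, sr=sr, n_fft=1024,
                                     hop_length=256, n_mels=80)

# Invert back to a waveform with Griffin-Lim phase estimation.
wav_after_inv = librosa.feature.inverse.mel_to_audio(mel, sr=sr, n_fft=1024,
                                                     hop_length=256)
sf.write("test.wav", wav_after_inv, sr)
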
Example #13
    def __getitem__(self, idx):
        while 1:
            idx = random.randint(0, len(self.all_videos) - 1)
            vidname = self.all_videos[idx]
            img_names = list(glob(join(vidname, '*.jpg')))
            if len(img_names) <= 3 * syncnet_T:
                continue
            
            img_name = random.choice(img_names)
            wrong_img_name = random.choice(img_names)
            while wrong_img_name == img_name:
                wrong_img_name = random.choice(img_names)

            window_fnames = self.get_window(img_name)
            wrong_window_fnames = self.get_window(wrong_img_name)
            if window_fnames is None or wrong_window_fnames is None:
                continue

            window = self.read_window(window_fnames)
            if window is None:
                continue

            wrong_window = self.read_window(wrong_window_fnames)
            if wrong_window is None:
                continue

            try:
                wavpath = join(vidname, "audio.wav")
                if wavpath not in self.shared_dict:
                    wav = audio.load_wav(wavpath, hparams.sample_rate)
                    orig_mel = audio.melspectrogram(wav).T
                    self.shared_dict[wavpath] = orig_mel
                else:
                    orig_mel = self.shared_dict[wavpath]
            except Exception as e:
                continue

            mel = self.crop_audio_window(orig_mel.copy(), img_name)
            
            if (mel.shape[0] != syncnet_mel_step_size):
                continue

            indiv_mels = self.get_segmented_mels(orig_mel.copy(), img_name)
            if indiv_mels is None: continue

            window = self.prepare_window(window)
            y = window.copy()
            window[:, :, window.shape[2]//2:] = 0.

            wrong_window = self.prepare_window(wrong_window)
            x = np.concatenate([window, wrong_window], axis=0)

            x = torch.FloatTensor(x)
            mel = torch.FloatTensor(mel.T).unsqueeze(0)
            indiv_mels = torch.FloatTensor(indiv_mels).unsqueeze(1)
            y = torch.FloatTensor(y)
            # print(x.shape)
            return x, indiv_mels, mel, y
Example #14
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, f0_filename, sp_filename,
      ap_filename, world_frames, text) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:

    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    #world parameters
    f0, sp, ap = audio.world(wav, hparams.sample_rate)
    f0 = (f0 / hparams.f0_norm).astype(np.float32)  #normalize
    sp = audio._normalize(sp).astype(np.float32)
    ap = ap.astype(np.float32)  # ap only takes values in [0, 1], so no normalization is needed
    world_frames = f0.shape[0]

    # Write the spectrograms to disk:
    spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index

    f0_filename = 'ljspeech-f0-%05d.npy' % index
    sp_filename = 'ljspeech-sp-%05d.npy' % index
    ap_filename = 'ljspeech-ap-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, f0_filename), f0, allow_pickle=False)
    np.save(os.path.join(out_dir, sp_filename), sp, allow_pickle=False)
    np.save(os.path.join(out_dir, ap_filename), ap, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, f0_filename,
            sp_filename, ap_filename, world_frames, text)
Example #15
def _process_utterance(out_dir, out_path, wav_path, text, stft):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel-scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      out_path: Path whose components determine the output subdirectories and filename
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file
      stft: STFT helper object used to compute the mel spectrogram

    Returns:
      A (mel_filename, n_frames, text) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    wav = wav / np.abs(wav).max() * 0.999
    #stft = audio.taco_stft()

    # Trim trailing silence from the audio file.
    wav = librosa.effects.trim(wav,
                               top_db=23,
                               frame_length=1024,
                               hop_length=256)[0]

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav,
                                           stft).numpy().astype(np.float32)

    # Write the spectrograms to disk:
    # spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    parts = out_path.strip().split('/')
    mel_filename = parts[4] + parts[5] + parts[6]
    o_path = os.path.join(parts[0], parts[1], parts[4])

    #    print(o_path)
    #    mel_filename = 'nam_speech-mel-%05d.npy' % index
    #  print(out_path)

    if (not os.path.exists(o_path)):
        os.mkdir(o_path)
    o_path = os.path.join(o_path, parts[5])
    if (not os.path.exists(o_path)):
        os.mkdir(o_path)
    o_path = os.path.join(o_path, parts[6])

    np.save(o_path, mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    # return (spectrogram_filename, mel_filename, n_frames, text)
    return (mel_filename, n_frames, text)
Example #16
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    try:
        # Compute the linear-scale spectrogram from the wav:
        spectrogram = audio.spectrogram(wav).astype(np.float32)
        n_frames = spectrogram.shape[1]

        # Compute a mel-scale spectrogram from the wav:
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    except Exception as e:
        print("Problem with:", wav_path)
        print(e)
        # Without the spectrograms there is nothing to save for this utterance.
        return None

    # Write the spectrograms to disk:
    spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # mdda added START :
    wav_filename = mel_filename.replace('-mel-', '-audio-')
    #wav_samples = hparams.fft_size + (n_frames-1)*hparams.hop_size  # No : 3 extra frames added : Don't bother chomping
    np.save(os.path.join(out_dir, wav_filename),
            wav.astype(np.float32),
            allow_pickle=False)
    spectrogramraw_filename = 'ljspeech-specraw-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogramraw_filename),
            spectrogram_raw(wav).T,
            allow_pickle=False)
    # mdda added END

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
Example #17
def _process_utterance_single(out_dir, text, wav_path, hparams=hparams):
    # modified version of LJSpeech _process_utterance
    audio.set_hparams(hparams)

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    sr = hparams.sample_rate
    # Added from the multispeaker version
    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")
    if not exists(lab_path):
        lab_path = os.path.splitext(wav_path)[0] + '.lab'

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        wav = clean_by_phoneme(labels, wav, sr)
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        if hparams.process_only_htk_aligned:
            return None
        wav, _ = librosa.effects.trim(wav, top_db=15)
    # End added from the multispeaker version

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    if hparams.max_audio_length != 0 and librosa.core.get_duration(
            y=wav, sr=sr) > hparams.max_audio_length:
        return None
    if hparams.min_audio_length != 0 and librosa.core.get_duration(
            y=wav, sr=sr) < hparams.min_audio_length:
        return None

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    # Get filename from wav_path
    wav_name = os.path.basename(wav_path)
    wav_name = os.path.splitext(wav_name)[0]
    spectrogram_filename = 'spec-{}.npy'.format(wav_name)
    mel_filename = 'mel-{}.npy'.format(wav_name)
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
Example #18
def _process_utterance(out_dir, text, wav_path, speaker_id=None):

    # check whether singlespeaker_mode
    if speaker_id is None:
        return _process_utterance_single(out_dir, text, wav_path)
    # modified version of VCTK _process_utterance
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")
    if not exists(lab_path):
        lab_path = os.path.splitext(wav_path)[0] + '.lab'

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        if hparams.process_only_htk_aligned:
            return None
        wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    # Get filename from wav_path
    wav_name = os.path.basename(wav_path)
    wav_name = os.path.splitext(wav_name)[0]

    # case if wave files across different speakers have the same naming format.
    # e.g. Recording0.wav
    spectrogram_filename = 'spec-{}-{}.npy'.format(speaker_id, wav_name)
    mel_filename = 'mel-{}-{}.npy'.format(speaker_id, wav_name)
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text, speaker_id)
Example #19
def _process_utterance(out_dir, index, wav_path, text, phone):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file
      phone: The phone sequence corresponding to the input audio file

    Returns:
      A (spectrogram_filename, encoded_filename, n_frames, text, phone) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    if hparams.vocoder == "world":
        spectrogram = audio.spectrogram(wav).astype(np.float32)

        f0, sp, ap = pw.wav2world(wav.astype(np.double), hparams.sample_rate)
        ap_coded = pw.code_aperiodicity(ap, hparams.sample_rate)
        sp_coded = pw.code_spectral_envelope(sp, hparams.sample_rate,
                                             hparams.coded_env_dim)

        world_spec = np.hstack([f0[:, np.newaxis], sp_coded, ap_coded])
        n_frames = world_spec.shape[0]
        spectrogram_filename = 'synpaflex-spec-%05d.npy' % index
        encoded_filename = 'synpaflex-world-%05d.npy' % index
        np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
        np.save(os.path.join(out_dir, encoded_filename), world_spec, allow_pickle=False)

    else:
        # Compute the linear-scale spectrogram from the wav:
        spectrogram = audio.spectrogram(wav).astype(np.float32)
        n_frames = spectrogram.shape[1]

        # Compute a mel-scale spectrogram from the wav:
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

        # Write the spectrograms to disk:
        spectrogram_filename = 'synpaflex-spec-%05d.npy' % index
        encoded_filename = 'synpaflex-mel-%05d.npy' % index
        np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
        np.save(os.path.join(out_dir, encoded_filename), mel_spectrogram.T, allow_pickle=False)


    # Return a tuple describing this training example:
    return (spectrogram_filename, encoded_filename, n_frames, text, phone)
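
The "world" branch relies on pyworld (imported as pw). For reference, a minimal round trip with pyworld alone; the wav path, the mono-audio assumption, and the coded envelope dimension of 60 are assumptions, since hparams.coded_env_dim is not shown here:

import numpy as np
import pyworld as pw
import soundfile as sf

wav, sr = sf.read("example.wav")                # any mono wav; path is illustrative
wav = wav.astype(np.double)

f0, sp, ap = pw.wav2world(wav, sr)              # WORLD analysis
fft_size = (sp.shape[1] - 1) * 2

sp_coded = pw.code_spectral_envelope(sp, sr, 60)
ap_coded = pw.code_aperiodicity(ap, sr)

# Decode and resynthesize to check that the coded features are usable.
sp_dec = pw.decode_spectral_envelope(sp_coded, sr, fft_size)
ap_dec = pw.decode_aperiodicity(ap_coded, sr, fft_size)
y = pw.synthesize(f0, sp_dec, ap_dec, sr)
sf.write("example_world_resynth.wav", y, sr)
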
Example #20
def _extract_mel(wav_path):
    # Load the audio to a numpy array. Resampled if needed.
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    assert len(out) // N == audio.get_hop_size()

    timesteps = len(out)

    return out, mel_spectrogram, timesteps, out_dtype
Example #21
def _process_utterance(out_dir, in_dir, label, speaker_name, hparams):
    wav_paths = glob.glob(os.path.join(in_dir, "*.wav"))
    if not wav_paths:
        return None

    total_utter_num = len(wav_paths)
    train_utter_num = (total_utter_num // 10) * 9
    print("[%s] train : %d, test : %d" %
          (speaker_name, train_utter_num, total_utter_num - train_utter_num))

    num_samples = len(wav_paths)
    npz_dir = os.path.join(out_dir, speaker_name)
    os.makedirs(npz_dir, exist_ok=True)

    # Set up the train & test output paths
    train_path = os.path.join(npz_dir, "train")
    test_path = os.path.join(npz_dir, "test")
    os.makedirs(train_path, exist_ok=True)
    os.makedirs(test_path, exist_ok=True)

    for idx, wav_path in enumerate(wav_paths):
        wav_name, ext = os.path.splitext(os.path.basename(wav_path))
        if ext == ".wav":
            wav, sr = librosa.load(wav_path, sr=hparams.sample_rate)

            # rescale wav
            if hparams.rescaling:  # hparams.rescale = True
                wav = wav / np.abs(wav).max() * hparams.rescaling_max

            # M-AILABS extra silence specific
            if hparams.trim_silence:  # hparams.trim_silence = True
                wav = trim_silence(
                    wav, hparams)  # Trim leading and trailing silence

            mel = melspectrogram(wav, hparams)
            seq_len = wav.shape[0]
            frame_len = mel.shape[1]

            # data output dir
            if idx < train_utter_num:
                data_out_dir = train_path
            else:
                data_out_dir = test_path
            file_name = wav_name
            np.savez(os.path.join(data_out_dir, file_name),
                     mel=mel.T,
                     speaker=label,
                     seq_len=seq_len,
                     frame_len=frame_len)

    return num_samples
Example #22
def process_video_file(vfile, args, split):
    video_stream = cv2.VideoCapture(vfile)
    frames = []
    while 1:
        still_reading, frame = video_stream.read()
        if not still_reading:
            video_stream.release()
            break
        frames.append(frame)
    mid_frames = []
    ss = 0.
    es = (ss + (window_size / 1000.))

    while int(es * fps) <= len(frames):
        mid_second = (ss + es) / 2.
        mid_frames.append(frames[int(mid_second * fps)])

        ss += (video_step_size_in_ms / 1000.)
        es = (ss + (window_size / 1000.))

    dst_subdir = path.join(
        vfile.split('/')[-2],
        vfile.split('/')[-1].split('.')[0])
    fulldir = path.join(args.final_data_root, split, dst_subdir)
    os.makedirs(fulldir, exist_ok=True)
    wavpath = path.join(fulldir, 'audio.wav')

    command = template.format(vfile, sr, wavpath)
    subprocess.call(command, shell=True)

    specpath = path.join(fulldir, 'mels.npz')

    if path.isfile(wavpath):
        wav = audio.load_wav(wavpath, sr)

        spec = audio.melspectrogram(wav)
        np.savez_compressed(specpath, spec=spec)
    else:
        return

    for i, f in enumerate(mid_frames):
        face, valid_frame = face_detect(f)

        if not valid_frame:
            continue

        resized_face = cv2.resize(face, (args.img_size, args.img_size))

        cv2.imwrite(path.join(fulldir, '{}.jpg'.format(i)), resized_face)
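
template and sr are module-level values not shown in this snippet; in similar preprocessing scripts the template is an ffmpeg invocation that extracts mono audio at the target sample rate. A plausible definition (an assumption, not the original):

sr = 16000  # assumed target sample rate
# {0}: input video, {1}: sample rate, {2}: output wav path
template = "ffmpeg -loglevel panic -y -i {} -ar {} -ac 1 {}"
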
Example #23
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate
    filename = os.path.basename(wav_path).replace('.wav', '')

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=25)
    # Librosa trim seems to cut off the ending part of speech
    else:
        wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Save trimmed wav
    save_wav_path = re.sub('wav48', 'wav_trim_22050', wav_path)
    save_dir = os.path.dirname(save_wav_path)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir, exist_ok=True)
    audio.save_wav(wav, save_wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = '{}-spec.npy'.format(filename)
    mel_filename = '{}-mel.npy'.format(filename)
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text, speaker_id)
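
The factor 1e-7 * sr converts HTK/HTS label times, which are stored in units of 100 ns, into sample indices. A small sketch of that conversion with an assumed sample rate:

sr = 22050                    # assumed sample rate
start_100ns = 2_000_000       # e.g. an hts label start time: 0.2 s in 100 ns units

start_seconds = start_100ns * 1e-7        # 100 ns -> seconds
start_sample = int(start_seconds * sr)    # seconds -> sample index
print(start_sample)                       # 4410
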
Example #24
def preprocess_test(speaker_id, speaker_file_name, start_num=200, data_path=hp.origin_data):
    out_dataset = hp.dataset_test_path
    if not os.path.exists(out_dataset):
        os.mkdir(out_dataset)

    file_path = os.path.join(data_path, speaker_file_name)
    wav_file_list = os.listdir(file_path)[start_num:]

    for utterance_id, wav_file in enumerate(wav_file_list):
        wav_file_path = os.path.join(file_path, wav_file)
        wav = audio.load_wav(wav_file_path)
        mel_spec = audio.melspectrogram(wav)

        save_file_name = str(speaker_id) + "_" + str(utterance_id) + ".npy"
        np.save(os.path.join(out_dataset, save_file_name), mel_spec)
Example #25
def _process_utterance(out_dir, index, wav_path, text, silence_threshold,
                       fft_size):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Mu-law quantize
    quantized = P.mulaw_quantize(wav)

    # Trim silences
    start, end = audio.start_and_end_indices(quantized, silence_threshold)
    quantized = quantized[start:end]
    wav = wav[start:end]

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    quantized = np.pad(quantized, (l, r),
                       mode="constant",
                       constant_values=P.mulaw_quantize(0))
    N = mel_spectrogram.shape[0]
    assert len(quantized) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    quantized = quantized[:N * audio.get_hop_size()]
    assert len(quantized) % audio.get_hop_size() == 0

    timesteps = len(quantized)

    wav_id = wav_path.split('/')[-1].split('.')[0]
    # Write the spectrograms to disk:
    audio_filename = '{}-audio.npy'.format(wav_id)
    mel_filename = '{}-mel.npy'.format(wav_id)
    np.save(os.path.join(out_dir, audio_filename),
            quantized.astype(np.int16),
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32),
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text)
Example #26
def wavenet_data(wav):
    # WAVENET TRANSFORMATIONS
    # Mu-law quantize
    out = P.mulaw_quantize(wav, hparams.quantize_channels)
    out8 = P.mulaw_quantize(wav, 256)

    # Trim silences
    start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
    wav = wav[start:end]
    out = out[start:end]
    constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
    out_dtype = np.int16

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    import matplotlib.pyplot as plt

    plt.subplot(3, 1, 1)
    specshow(mel_spectrogram.T, sr=20000, hop_length=hparams.hop_size)
    plt.subplot(3, 1, 2)
    plt.plot(out)
    plt.xlim(0, len(out))
    plt.subplot(3, 1, 3)
    plt.plot(wav)
    plt.xlim(0, len(wav))
    plt.show()

    out = out / out.max()  # normalize (avoids in-place division on the integer array)
Example #27
def _process_utterance(out_dir,
                       index,
                       speaker_id,
                       wav_path,
                       text,
                       hparams=hparams):
    sr = hparams.sample_rate
    audio.set_hparams(hparams)

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'vctk-spec-%05d.npy' % index
    mel_filename = 'vctk-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text, speaker_id)
Example #28
def _process_utterance(out_dir, index, wav_path, pinyin):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
    out_dir: The directory to write the spectrograms into
    index: The numeric index to use in the spectrogram filenames.
    wav_path: Path to the audio file containing the speech input
    pinyin: The pinyin of Chinese spoken in the input audio file

    Returns:
    A (spectrogram_filename, mel_filename, n_frames, pinyin) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # rescale wav for unified measure for all clips
    wav = wav / np.abs(wav).max() * 0.999

    # trim silence
    wav = audio.trim_silence(wav)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    if n_frames > hp.max_frame_num:
        return None

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'biaobei-spec-%05d.npy' % index
    mel_filename = 'biaobei-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, pinyin)
Example #29
def preprocess_test(speaker_id,
                    speaker_file_name,
                    class_num_remaining=50,
                    data_path=hp.origin_data):
    out_dataset = hp.dataset_test_path
    if not os.path.exists(out_dataset):
        os.mkdir(out_dataset)

    file_path = os.path.join(data_path, speaker_file_name)
    total_len = len(os.listdir(file_path))
    cut_length = total_len - class_num_remaining
    wav_file_list = os.listdir(file_path)[cut_length:]

    for utterance_id, wav_file in enumerate(wav_file_list):
        wav_file_path = os.path.join(file_path, wav_file)
        wav = audio.load_wav(wav_file_path)
        mel_spec = audio.melspectrogram(wav)

        save_file_name = str(speaker_id) + "_" + str(utterance_id) + ".npy"
        np.save(os.path.join(out_dataset, save_file_name), mel_spec)
Example #30
def _process_utterance(wav_path):
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    # n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Return a tuple describing this training example:
    return spectrogram.T, mel_spectrogram.T