def _process_utterance(out_dir, in_dir, source_wav_name, target_wav_name, emotion_id):
    """Preprocess one source/target wav pair and save its spectrograms.

    Both wavs are loaded from ``in_dir``. The linear spectrogram of the target
    and the mel spectrograms of source and target are written to ``out_dir``
    as transposed float32 ``.npy`` files. No linear spectrogram is written
    for the source.

    Args:
        out_dir: Directory the ``.npy`` files are written into.
        in_dir: Directory containing both wav files.
        source_wav_name: File name of the source wav inside ``in_dir``.
        target_wav_name: File name of the target wav inside ``in_dir``.
        emotion_id: Returned unchanged as the first tuple element.

    Returns:
        A tuple ``(emotion_id, target_spec_filename, source_mel_filename,
        target_mel_filename, source_n_frames, target_n_frames)`` where the
        frame counts are ``shape[1]`` of the respective mel spectrograms.
    """
    # Load both signals as numpy arrays.
    src_wav = audio.load_wav(os.path.join(in_dir, source_wav_name))
    tgt_wav = audio.load_wav(os.path.join(in_dir, target_wav_name))

    if hparams.rescaling:
        # Scale each signal so its largest magnitude equals rescaling_max.
        src_wav = src_wav / np.abs(src_wav).max() * hparams.rescaling_max
        tgt_wav = tgt_wav / np.abs(tgt_wav).max() * hparams.rescaling_max

    # Linear spectrogram for the target only; mel spectrograms for both.
    tgt_linear = audio.spectrogram(tgt_wav).astype(np.float32)
    src_mel = audio.melspectrogram(src_wav).astype(np.float32)
    tgt_mel = audio.melspectrogram(tgt_wav).astype(np.float32)

    # Output names are derived from the wav names with '.wav' removed.
    src_stem = source_wav_name.replace('.wav', '')
    tgt_stem = target_wav_name.replace('.wav', '')
    tgt_linear_name = 'target-spec-{}.npy'.format(tgt_stem)
    src_mel_name = 'source-mel-{}.npy'.format(src_stem)
    tgt_mel_name = 'target-mel-{}.npy'.format(tgt_stem)

    # Write the transposed features to disk.
    outputs = (
        (tgt_linear_name, tgt_linear),
        (src_mel_name, src_mel),
        (tgt_mel_name, tgt_mel),
    )
    for file_name, features in outputs:
        np.save(os.path.join(out_dir, file_name), features.T,
                allow_pickle=False)

    return (emotion_id, tgt_linear_name, src_mel_name, tgt_mel_name,
            src_mel.shape[1], tgt_mel.shape[1])
def gen_data(audio_path, full_frames):
    """Create the ``datagen`` generator for one audio file and its frames.

    The wav is loaded with a sample rate argument of 16000, converted to a
    mel spectrogram and cut into windows of ``mel_step_size`` columns. Window
    ``i`` starts at column ``int(i * 80. / fps)``; the final window is taken
    from the end of the spectrogram. ``full_frames`` is truncated to the
    number of windows before both are passed to ``datagen``.

    Args:
        audio_path: Path of the wav file to load.
        full_frames: Frames to pair with the mel windows; must support
            slicing and ``.copy()``.

    Returns:
        The object returned by ``datagen(frames, mel_chunks)``.

    Raises:
        ValueError: If the mel spectrogram contains NaN values.
    """
    samples = audio.load_wav(audio_path, 16000)
    mel = audio.melspectrogram(samples)
    print(mel.shape)

    if np.isnan(mel).any():
        raise ValueError(
            'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again'
        )

    n_cols = len(mel[0])
    cols_per_frame = 80. / fps
    mel_chunks = []
    frame_no = 0
    while True:
        begin = int(frame_no * cols_per_frame)
        stop = begin + mel_step_size
        if stop > n_cols:
            # Not enough columns left: use the last full-size window instead.
            mel_chunks.append(mel[:, n_cols - mel_step_size:])
            break
        mel_chunks.append(mel[:, begin:stop])
        frame_no += 1

    print("Length of mel chunks: {}".format(len(mel_chunks)))

    # Keep only as many frames as there are mel windows.
    frames = full_frames[:len(mel_chunks)]
    return datagen(frames.copy(), mel_chunks)
def _process_utterance(out_dir, index, wav_path, text):
    """Compute the aligned audio/mel pair for one utterance.

    NOTE: the ``np.save`` calls are commented out in this version, so nothing
    is written to ``out_dir``; only the file names and length are returned.

    Args:
        out_dir: Target directory (only referenced by the disabled saves).
        index: Numeric index used in the returned file names.
        wav_path: Path to the wav file to load.
        text: Transcript, returned unchanged.

    Returns:
        A tuple ``(audio_filename, mel_filename, timesteps, text)`` where
        ``timesteps`` is the length of the padded and truncated audio signal.
    """
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    input_type = hparams.input_type
    if is_mulaw_quantize(input_type):
        # Mu-law quantize, then trim silence from both representations.
        signal = P.mulaw_quantize(wav, hparams.quantize_channels)
        first, last = audio.start_and_end_indices(signal,
                                                  hparams.silence_threshold)
        wav = wav[first:last]
        signal = signal[first:last]
        pad_value = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(input_type):
        signal = P.mulaw(wav, hparams.quantize_channels)
        pad_value = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        signal = wav
        pad_value = 0.0
        out_dtype = np.float32

    # Mel spectrogram of the (possibly trimmed) wav, transposed.
    mel = audio.melspectrogram(wav).astype(np.float32).T
    hop = audio.get_hop_size()

    # Pad the signal by the amounts lws_pad_lr reports so that its length
    # lines up with the mel spectrogram.
    pad_left, pad_right = audio.lws_pad_lr(wav, hparams.fft_size, hop)
    signal = np.pad(signal, (pad_left, pad_right), mode="constant",
                    constant_values=pad_value)

    n_frames = mel.shape[0]
    assert len(signal) >= n_frames * hop

    # Truncate so the audio length is exactly n_frames * hop.
    signal = signal[:n_frames * hop]
    assert len(signal) % hop == 0

    timesteps = len(signal)

    audio_filename = 'ljspeech-audio-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    # Saving is disabled; out_dtype is only needed by these calls.
    # np.save(os.path.join(out_dir, audio_filename),
    #         signal.astype(out_dtype), allow_pickle=False)
    # np.save(os.path.join(out_dir, mel_filename),
    #         mel.astype(np.float32), allow_pickle=False)

    return (audio_filename, mel_filename, timesteps, text)
def _process_utterance(out_dir, index, wav_path, text):
    """Save linear and mel spectrograms for one utterance.

    Args:
        out_dir: Directory to write the spectrogram ``.npy`` files into.
        index: Numeric index used in the output file names.
        wav_path: Path to the wav file to load.
        text: Transcript, returned unchanged.

    Returns:
        A tuple ``(spectrogram_filename, mel_filename, n_frames, text)``
        where ``n_frames`` is ``shape[1]`` of the linear spectrogram.
    """
    samples = audio.load_wav(wav_path)

    linear = audio.spectrogram(samples).astype(np.float32)
    mel = audio.melspectrogram(samples).astype(np.float32)

    linear_name = 'nikl-single-spec-%05d.npy' % index
    mel_name = 'nikl-single-mel-%05d.npy' % index

    # Both arrays are stored transposed.
    for file_name, features in ((linear_name, linear), (mel_name, mel)):
        np.save(os.path.join(out_dir, file_name), features.T,
                allow_pickle=False)

    return (linear_name, mel_name, linear.shape[1], text)
def __getitem__(self, index):
    """Return a ``(mel, audio)`` pair for the file at ``index``.

    The file is loaded twice: via ``deepaudio.load_wav`` (used for the mel
    spectrogram) and via ``utils.load_wav_to_torch`` (used for the mu-law
    encoded target). Signals at least ``self.segment_length`` long are
    cropped at a random offset, using the same offset for both; shorter
    signals are zero-padded on the right.

    Args:
        index: Position in ``self.audio_files``.

    Returns:
        A tuple ``(mel, audio)``: the mel spectrogram as a torch tensor and
        the mu-law encoded raw audio.
    """
    path = self.audio_files[index]
    wav = deepaudio.load_wav(path)
    raw_audio, _ = utils.load_wav_to_torch(path)

    signal = torch.from_numpy(wav)
    seg_len = self.segment_length

    if signal.size(0) < seg_len:
        # Too short: pad each signal on the right by its own shortfall.
        signal = torch.nn.functional.pad(
            signal, (0, seg_len - signal.size(0)), 'constant').data
        raw_audio = torch.nn.functional.pad(
            raw_audio, (0, seg_len - raw_audio.size(0)), 'constant').data
    else:
        # Long enough: take the same random window from both signals.
        # NOTE(review): assumes both loaders yield equally long signals.
        offset = random.randint(0, signal.size(0) - seg_len)
        signal = signal[offset:offset + seg_len]
        raw_audio = raw_audio[offset:offset + seg_len]

    mel = torch.from_numpy(deepaudio.melspectrogram(signal.numpy()))
    encoded = utils.mu_law_encode(raw_audio / utils.MAX_WAV_VALUE,
                                  self.mu_quantization)
    return (mel, encoded)
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    """Trim, optionally rescale and featurise one multi-speaker utterance.

    Args:
        out_dir: Directory to write the spectrogram ``.npy`` files into.
        index: Numeric index used in the output file names.
        speaker_id: Speaker identifier, returned unchanged.
        wav_path: Path to the wav file to load.
        text: Transcript, returned unchanged.

    Returns:
        A tuple ``(spectrogram_filename, mel_filename, n_frames, text,
        speaker_id)`` where ``n_frames`` is ``shape[1]`` of the linear
        spectrogram.
    """
    samples = audio.load_wav(wav_path)
    # Trim with librosa before any rescaling.
    samples, _ = librosa.effects.trim(samples, top_db=15)

    if hparams.rescaling:
        samples = samples / np.abs(samples).max() * hparams.rescaling_max

    linear = audio.spectrogram(samples).astype(np.float32)
    mel = audio.melspectrogram(samples).astype(np.float32)

    linear_name = 'vctk-spec-%05d.npy' % index
    mel_name = 'vctk-mel-%05d.npy' % index

    # Both arrays are stored transposed.
    for file_name, features in ((linear_name, linear), (mel_name, mel)):
        np.save(os.path.join(out_dir, file_name), features.T,
                allow_pickle=False)

    return linear_name, mel_name, linear.shape[1], text, speaker_id
def _process_utterance(out_dir, wav_path):
    """Return the transposed float32 mel spectrogram of one wav file.

    The wav is optionally peak-rescaled. When the configured input type is
    mu-law-quantize, silence is trimmed using the quantized signal; for every
    other input type the wav is used as loaded.

    The previous version also built a mu-law signal, a pad value and an
    output dtype in every branch, none of which were used; only the trimming
    affects the result, so the rest has been removed.

    Args:
        out_dir: Unused; kept so existing callers keep working.
        wav_path: Path to the wav file to load.

    Returns:
        ``audio.melspectrogram(wav)`` cast to float32 and transposed.
    """
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    if is_mulaw_quantize(hparams.input_type):
        # The quantized signal is needed only to locate the silence bounds.
        quantized = P.mulaw_quantize(wav, hparams.quantize_channels)
        start, end = audio.start_and_end_indices(quantized,
                                                 hparams.silence_threshold)
        wav = wav[start:end]

    return audio.melspectrogram(wav).astype(np.float32).T
def process(info_dict):
    """Save the mel spectrogram of a sentence and build its label strings.

    The wav ``<hp.data_path>/Wave/<sentence_id>.wav`` is loaded and its
    transposed mel spectrogram is saved to ``<hp.mel_path>/<sentence_id>.npy``.
    Two strings are then built, both prefixed with ``"<sentence_id>|"``:

    * the phone ids of the alignment, space separated;
    * one phone id per mel frame, chosen by the alignment interval that
      contains the frame's centre time. A frame that falls in no interval
      reuses the most recently matched phone.

    Args:
        info_dict: Mapping with ``"sentence_id"`` and ``"alignment"``. Each
            alignment entry is indexed as ``entry[0]`` (a key of
            ``phone_map``) and ``entry[1][0]`` / ``entry[1][1]`` (interval
            bounds compared against times in seconds).

    Returns:
        A tuple ``(phone_idx, duration_idx)`` of strings, each with its final
        character stripped.
    """
    sentence_id = info_dict["sentence_id"]
    alignment = info_dict["alignment"]

    wav = audio.load_wav(
        os.path.join(hp.data_path, "Wave", sentence_id + ".wav"))
    mel = audio.melspectrogram(wav).T
    np.save(os.path.join(hp.mel_path, sentence_id + ".npy"), mel)

    prefix = sentence_id + "|"
    phone_idx = prefix + "".join(
        str(phone_map[entry[0]]) + " " for entry in alignment)

    n_phones = len(alignment)
    frame_labels = []
    cursor = 0
    for frame_id in range(mel.shape[0]):
        # Centre time of this frame, in seconds.
        centre = hp.frame_length_ms / 2 + frame_id * hp.frame_shift_ms
        centre = centre / 1000.0
        # The search resumes at the last matched phone.
        for pos in range(cursor, n_phones):
            span = alignment[pos][1]
            if span[0] <= centre < span[1]:
                cursor = pos
                break
        # With no match, cursor still points at the previous phone.
        frame_labels.append(str(phone_map[alignment[cursor][0]]))

    duration_idx = prefix + "".join(label + " " for label in frame_labels)

    # Drop the trailing separator from both strings.
    return phone_idx[:-1], duration_idx[:-1]
def extract_MFCC_and_text(wav_file_path, mfcc_dir):
    """Write a cleaned transcript and a mel spectrogram per matching wav.

    For each wav matched by the glob pattern, the transcript found by
    replacing ``".WAV.wav"`` with ``".TXT"`` in the wav path is read. Its
    first two whitespace-separated tokens are dropped and the rest is
    lower-cased and written to ``<mfcc_dir>/<name>.txt``. The float32 mel
    spectrogram of the wav is saved under ``<mfcc_dir>/<name>``.

    Despite the function name, the saved feature is
    ``audio.melspectrogram`` output, not MFCCs.

    Args:
        wav_file_path: Glob pattern selecting the wav files.
        mfcc_dir: Directory that receives the text and feature files.

    Returns:
        None.
    """
    for wav_fname in glob.glob(wav_file_path):
        transcript_path = wav_fname.replace(".WAV.wav", ".TXT")
        # Base name up to the first dot, e.g. 'a/b/SX1.WAV.wav' -> 'SX1'.
        stem = wav_fname.split('/')[-1].split('.')[0]

        with open(transcript_path, 'r') as src:
            tokens = src.read().split()
        # Skip the first two tokens; the joined text ends with ' \n'.
        sentence = ' '.join(tokens[2:] + ['\n']).lower()

        with open(mfcc_dir + '/' + stem + '.txt', "w") as dst:
            dst.write(sentence)

        samples = audio.load_wav(wav_fname)
        mel = audio.melspectrogram(samples).astype(np.float32)
        # NOTE(review): the original comment claimed shape "L x 80", but the
        # array is saved without a transpose - confirm the expected layout.
        np.save(mfcc_dir + '/' + stem, mel, allow_pickle=False)
    return
def _process_utterance(out_dir, in_dir, label, speaker_name, hparams):
    """Convert every ``*.wav`` in ``in_dir`` to an ``.npz`` feature file.

    Each archive holds the transposed mel spectrogram (``mel``), the speaker
    label (``speaker``), the number of wav samples (``seq_len``) and
    ``mel.shape[1]`` (``frame_len``).

    Args:
        out_dir: Directory the ``.npz`` files are written into.
        in_dir: Directory searched for ``*.wav`` files.
        label: Value stored under ``speaker`` in every archive.
        speaker_name: Name of a sub-directory created under ``out_dir``.
        hparams: Hyper-parameters (``sample_rate``, ``rescaling``,
            ``rescaling_max``, ``trim_silence``) and the object passed on to
            ``trim_silence`` / ``melspectrogram``.

    Returns:
        The number of wav paths found, or ``None`` when there are none.
    """
    wav_paths = glob.glob(os.path.join(in_dir, "*.wav"))
    if not wav_paths:
        return None

    num_samples = len(wav_paths)

    # NOTE(review): this directory is created, but the archives below are
    # written directly into out_dir - confirm which location is intended.
    os.makedirs(os.path.join(out_dir, speaker_name), exist_ok=True)

    for wav_path in wav_paths:
        wav_name, ext = os.path.splitext(os.path.basename(wav_path))
        if ext != ".wav":
            continue

        wav, _ = librosa.load(wav_path, sr=hparams.sample_rate)

        if hparams.rescaling:
            wav = wav / np.abs(wav).max() * hparams.rescaling_max

        if hparams.trim_silence:
            # Remove leading and trailing silence.
            wav = trim_silence(wav, hparams)

        mel = melspectrogram(wav, hparams)

        np.savez(os.path.join(out_dir, wav_name),
                 mel=mel.T,
                 speaker=label,
                 seq_len=wav.shape[0],
                 frame_len=mel.shape[1])

    return num_samples
def gen_samples(out_dir, wav_path, n_samples):
    """Cut ``n_samples`` random aligned (wav, spec, mel) segments from a wav.

    The wav is truncated to one sample less than a whole number of hops, then
    its linear and mel spectrograms are computed. For each randomly drawn
    frame offset a segment of ``hparams.spec_len`` frames and the matching
    ``hparams.seg_len`` wav samples are saved to
    ``<out_dir>/<wav_name>_<NNN>.npz`` under the keys ``wav``, ``spec`` and
    ``mel``.

    Args:
        out_dir: Directory the ``.npz`` files are written into.
        wav_path: Path to the wav file; its base name up to the first dot is
            used in the output names.
        n_samples: Number of segments to draw.

    Returns:
        List of written file paths. Empty when the spectrogram is too short
        to draw an offset from.
    """
    wav = audio.load_wav(wav_path)
    hop_size = hparams.hop_length
    seg_len = hparams.seg_len
    spec_len = hparams.spec_len

    # Truncate to a multiple of hop_size, minus one sample (kept from the
    # original, whose author was unsure why the -1 is required).
    wav_len = wav.shape[0] // hop_size * hop_size - 1
    wav = wav[:wav_len]

    spec = audio.spectrogram(wav)
    mel = audio.melspectrogram(wav)

    max_val = spec.shape[1] - 1 - spec_len
    # randint(0, max_val) needs max_val > 0. The old `< 0` check let
    # max_val == 0 through, which raised ValueError (low >= high).
    if max_val <= 0:
        return []

    offsets = np.random.randint(0, max_val, size=n_samples)

    # The output name prefix does not depend on the offset.
    wav_name = wav_path.split('/')[-1].split('.')[0]

    written = []
    for i, offset in enumerate(offsets, start=1):
        w = wav[offset * hop_size:offset * hop_size + seg_len]
        s = spec[:, offset:offset + spec_len]
        m = mel[:, offset:offset + spec_len]
        file_path = "{0}/{1}_{2:03d}.npz".format(out_dir, wav_name, i)
        np.savez(file_path, wav=w, spec=s, mel=m)
        written.append(file_path)
    return written
def test():
    """Round-trip one LJSpeech wav through mel and back to ``test.wav``.

    Loads ``data/LJSpeech-1.1/wavs/LJ001-0001.wav``, computes its mel
    spectrogram, inverts it with ``audio.inv_mel_spectrogram`` and saves the
    result as ``test.wav`` in the current working directory.
    """
    wav_path = os.path.join("data", "LJSpeech-1.1", "wavs", "LJ001-0001.wav")
    mel_spec = audio.melspectrogram(audio.load_wav(wav_path))
    reconstructed = audio.inv_mel_spectrogram(mel_spec)
    audio.save_wav(reconstructed, "test.wav")
def __getitem__(self, idx):
    """Return one randomly drawn training sample.

    The ``idx`` argument is ignored: a video directory is picked at random
    and the draw is repeated until a usable sample is found. A candidate is
    rejected when the directory has too few jpg frames, a frame window
    cannot be built or read, the audio cannot be loaded, or the mel crop has
    the wrong length.

    Mel spectrograms are cached per wav path in ``self.shared_dict``.

    Returns:
        A tuple ``(x, indiv_mels, mel, y)`` of float tensors, where ``x`` is
        the half-masked target window concatenated with the "wrong" window
        along axis 0 and ``y`` is the unmasked target window.
    """
    while True:
        idx = random.randint(0, len(self.all_videos) - 1)
        video_dir = self.all_videos[idx]
        frame_paths = list(glob(join(video_dir, '*.jpg')))
        if len(frame_paths) <= 3 * syncnet_T:
            continue

        # Pick a target frame and a different "wrong" frame.
        img_name = random.choice(frame_paths)
        wrong_img_name = random.choice(frame_paths)
        while wrong_img_name == img_name:
            wrong_img_name = random.choice(frame_paths)

        window_fnames = self.get_window(img_name)
        wrong_window_fnames = self.get_window(wrong_img_name)
        if window_fnames is None or wrong_window_fnames is None:
            continue

        window = self.read_window(window_fnames)
        if window is None:
            continue

        wrong_window = self.read_window(wrong_window_fnames)
        if wrong_window is None:
            continue

        try:
            wavpath = join(video_dir, "audio.wav")
            if wavpath in self.shared_dict:
                orig_mel = self.shared_dict[wavpath]
            else:
                wav = audio.load_wav(wavpath, hparams.sample_rate)
                orig_mel = audio.melspectrogram(wav).T
                self.shared_dict[wavpath] = orig_mel
        except Exception:
            # Any failure while loading the audio just triggers a redraw.
            continue

        mel = self.crop_audio_window(orig_mel.copy(), img_name)
        if mel.shape[0] != syncnet_mel_step_size:
            continue

        indiv_mels = self.get_segmented_mels(orig_mel.copy(), img_name)
        if indiv_mels is None:
            continue

        window = self.prepare_window(window)
        # Ground truth is copied before the second half of axis 2 is zeroed.
        y = window.copy()
        window[:, :, window.shape[2] // 2:] = 0.

        wrong_window = self.prepare_window(wrong_window)
        x = np.concatenate([window, wrong_window], axis=0)

        x = torch.FloatTensor(x)
        mel = torch.FloatTensor(mel.T).unsqueeze(0)
        indiv_mels = torch.FloatTensor(indiv_mels).unsqueeze(1)
        y = torch.FloatTensor(y)
        return x, indiv_mels, mel, y
def _process_utterance(out_dir, index, wav_path, text):
    """Save spectrogram, mel and WORLD features for one utterance.

    Writes five ``.npy`` files to ``out_dir``: the transposed linear and mel
    spectrograms, and the WORLD parameters f0 (divided by
    ``hparams.f0_norm``), sp (passed through ``audio._normalize``) and ap.

    Args:
        out_dir: Directory to write the feature files into.
        index: Numeric index used in the output file names.
        wav_path: Path to the wav file to load.
        text: Transcript, returned unchanged.

    Returns:
        A tuple ``(spectrogram_filename, mel_filename, n_frames,
        f0_filename, sp_filename, ap_filename, world_frames, text)`` where
        ``n_frames`` is ``shape[1]`` of the linear spectrogram and
        ``world_frames`` is ``f0.shape[0]``.
    """
    # NOTE: a dangling ''' that followed the return statement has been
    # removed; it opened an unterminated string literal.

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # WORLD parameters
    f0, sp, ap = audio.world(wav, hparams.sample_rate)
    f0 = (f0 / hparams.f0_norm).astype(np.float32)  # normalize
    sp = audio._normalize(sp).astype(np.float32)
    # ap only takes values in the 0-1 range, so no normalization is needed.
    ap = ap.astype(np.float32)
    world_frames = f0.shape[0]

    # Write the features to disk:
    spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    f0_filename = 'ljspeech-f0-%05d.npy' % index
    sp_filename = 'ljspeech-sp-%05d.npy' % index
    ap_filename = 'ljspeech-ap-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, f0_filename), f0, allow_pickle=False)
    np.save(os.path.join(out_dir, sp_filename), sp, allow_pickle=False)
    np.save(os.path.join(out_dir, ap_filename), ap, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, f0_filename,
            sp_filename, ap_filename, world_frames, text)
def _process_utterance(out_dir, out_path, wav_path, text, stft):
    """Save the mel spectrogram of one utterance under a derived path.

    The wav is peak-normalised to 0.999 and trimmed with librosa before
    feature extraction. The output location comes from the '/'-separated
    components of ``out_path``: the file is saved as
    ``parts[0]/parts[1]/parts[4]/parts[5]/parts[6]``. ``np.save`` appends
    ``.npy`` when the name lacks it.

    Args:
        out_dir: Unused; kept so existing callers keep working.
        out_path: '/'-separated path with at least seven components.
        wav_path: Path to the wav file to load.
        text: Transcript, returned unchanged.
        stft: Object passed to ``audio.melspectrogram`` as second argument.

    Returns:
        A tuple ``(mel_filename, n_frames, text)`` where ``mel_filename`` is
        ``parts[4] + parts[5] + parts[6]`` (no separators) and ``n_frames``
        is ``shape[1]`` of the linear spectrogram.
    """
    # Load the audio and peak-normalise it.
    wav = audio.load_wav(wav_path)
    wav = wav / np.abs(wav).max() * 0.999

    # Trim with librosa (the original comment names trailing silence).
    wav = librosa.effects.trim(wav, top_db=23, frame_length=1024,
                               hop_length=256)[0]

    # The linear spectrogram is only used for the frame count.
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    mel_spectrogram = audio.melspectrogram(wav,
                                           stft).numpy().astype(np.float32)

    parts = out_path.strip().split('/')
    mel_filename = parts[4] + parts[5] + parts[6]

    # makedirs(exist_ok=True) replaces the exists()/mkdir() pairs: those
    # failed when two workers raced on the same directory or when the
    # parent directory did not exist yet.
    save_dir = os.path.join(parts[0], parts[1], parts[4], parts[5])
    os.makedirs(save_dir, exist_ok=True)

    np.save(os.path.join(save_dir, parts[6]), mel_spectrogram.T,
            allow_pickle=False)

    return (mel_filename, n_frames, text)
def _process_utterance(out_dir, index, wav_path, text):
    """Save spectrograms, raw spectrogram and audio for one utterance.

    Four ``.npy`` files are written to ``out_dir``: the transposed linear
    and mel spectrograms, the float32 waveform, and the transposed output
    of ``spectrogram_raw``.

    Args:
        out_dir: Directory to write the feature files into.
        index: Numeric index used in the output file names.
        wav_path: Path to the wav file to load.
        text: Transcript, returned unchanged.

    Returns:
        A tuple ``(spectrogram_filename, mel_filename, n_frames, text)``.

    Raises:
        Exception: Whatever the spectrogram computation raises. The failing
            path is printed first. The old code swallowed the error and then
            failed with UnboundLocalError when saving.
    """
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    try:
        # Compute the linear-scale spectrogram from the wav:
        spectrogram = audio.spectrogram(wav).astype(np.float32)
        n_frames = spectrogram.shape[1]

        # Compute a mel-scale spectrogram from the wav:
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    except Exception as e:
        print("Problem with :", wav_path)
        print(e)
        # Nothing useful can be saved without the features, so re-raise
        # the real cause.
        raise

    # Write the spectrograms to disk:
    spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T,
            allow_pickle=False)

    # Also store the waveform itself, named after the mel file.
    wav_filename = mel_filename.replace('-mel-', '-audio-')
    np.save(os.path.join(out_dir, wav_filename), wav.astype(np.float32),
            allow_pickle=False)

    # ... and the output of spectrogram_raw.
    spectrogramraw_filename = 'ljspeech-specraw-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogramraw_filename),
            spectrogram_raw(wav).T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
def _process_utterance_single(out_dir, text, wav_path, hparams=hparams):
    """Preprocess one single-speaker utterance.

    Silence is trimmed using an HTS label file when one exists, found either
    by mapping ``wav48/`` to ``lab/`` in the path or next to the wav;
    otherwise librosa trimming alone is used. Utterances can be skipped for
    missing labels or for length.

    Args:
        out_dir: Directory to write the spectrogram ``.npy`` files into.
        text: Transcript, returned unchanged.
        wav_path: Path to the wav file to load.
        hparams: Hyper-parameters; also installed via ``audio.set_hparams``.

    Returns:
        ``(spectrogram_filename, mel_filename, n_frames, text)``, or ``None``
        when the utterance is skipped: no label file while
        ``process_only_htk_aligned`` is set, or a duration outside the
        non-zero ``min_audio_length`` / ``max_audio_length`` limits.
    """
    audio.set_hparams(hparams)

    wav = audio.load_wav(wav_path)
    sr = hparams.sample_rate

    # Locate the label file: dedicated lab/ tree first, then beside the wav.
    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")
    if not exists(lab_path):
        lab_path = os.path.splitext(wav_path)[0] + '.lab'

    if not exists(lab_path):
        if hparams.process_only_htk_aligned:
            return None
        wav, _ = librosa.effects.trim(wav, top_db=15)
    else:
        # Clean using the labels, then trim with librosa.
        labels = hts.load(lab_path)
        wav = clean_by_phoneme(labels, wav, sr)
        wav, _ = librosa.effects.trim(wav, top_db=25)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Length filter; a limit of 0 disables the corresponding check.
    max_len = hparams.max_audio_length
    min_len = hparams.min_audio_length
    if max_len != 0 or min_len != 0:
        duration = librosa.core.get_duration(y=wav, sr=sr)
        if max_len != 0 and duration > max_len:
            return None
        if min_len != 0 and duration < min_len:
            return None

    linear = audio.spectrogram(wav).astype(np.float32)
    mel = audio.melspectrogram(wav).astype(np.float32)

    # Output names are based on the wav file name without extension.
    wav_name = os.path.splitext(os.path.basename(wav_path))[0]
    linear_name = 'spec-{}.npy'.format(wav_name)
    mel_name = 'mel-{}.npy'.format(wav_name)

    for file_name, features in ((linear_name, linear), (mel_name, mel)):
        np.save(os.path.join(out_dir, file_name), features.T,
                allow_pickle=False)

    return (linear_name, mel_name, linear.shape[1], text)
def _process_utterance(out_dir, text, wav_path, speaker_id=None):
    """Preprocess one utterance, multi-speaker when ``speaker_id`` is given.

    Without a ``speaker_id`` the call is delegated to
    ``_process_utterance_single``. Otherwise the wav is cut to the span
    given by an HTS label file when one exists, trimmed with librosa,
    optionally rescaled, and its spectrograms are written to ``out_dir``.

    Args:
        out_dir: Directory to write the spectrogram ``.npy`` files into.
        text: Transcript, returned unchanged.
        wav_path: Path to the wav file to load.
        speaker_id: Speaker identifier, embedded in the file names so that
            equally named wavs from different speakers do not collide.

    Returns:
        ``(spectrogram_filename, mel_filename, n_frames, text, speaker_id)``,
        the result of ``_process_utterance_single`` in single-speaker mode,
        or ``None`` when no label file exists and
        ``hparams.process_only_htk_aligned`` is set.
    """
    if speaker_id is None:
        return _process_utterance_single(out_dir, text, wav_path)

    sr = hparams.sample_rate
    wav = audio.load_wav(wav_path)

    # Locate the label file: dedicated lab/ tree first, then beside the wav.
    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")
    if not exists(lab_path):
        lab_path = os.path.splitext(wav_path)[0] + '.lab'

    if not exists(lab_path):
        if hparams.process_only_htk_aligned:
            return None
        wav, _ = librosa.effects.trim(wav, top_db=15)
    else:
        labels = hts.load(lab_path)
        # Label times are scaled by 1e-7 * sr to obtain sample indices.
        first = int(start_at(labels) * 1e-7 * sr)
        last = int(end_at(labels) * 1e-7 * sr)
        wav = wav[first:last]
        wav, _ = librosa.effects.trim(wav, top_db=25)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    linear = audio.spectrogram(wav).astype(np.float32)
    mel = audio.melspectrogram(wav).astype(np.float32)

    wav_name = os.path.splitext(os.path.basename(wav_path))[0]
    linear_name = 'spec-{}-{}.npy'.format(speaker_id, wav_name)
    mel_name = 'mel-{}-{}.npy'.format(speaker_id, wav_name)

    for file_name, features in ((linear_name, linear), (mel_name, mel)):
        np.save(os.path.join(out_dir, file_name), features.T,
                allow_pickle=False)

    return (linear_name, mel_name, linear.shape[1], text, speaker_id)
def _process_utterance(out_dir, index, wav_path, text, phone):
    """Save the linear spectrogram plus WORLD or mel features.

    The linear spectrogram is always written. With
    ``hparams.vocoder == "world"`` the second file holds f0, coded spectral
    envelope and coded aperiodicity stacked column-wise; otherwise it holds
    the transposed mel spectrogram.

    Args:
        out_dir: Directory to write the feature files into.
        index: Numeric index used in the output file names.
        wav_path: Path to the wav file to load.
        text: Transcript, returned unchanged.
        phone: Returned unchanged as the last tuple element.

    Returns:
        A tuple ``(spectrogram_filename, encoded_filename, n_frames, text,
        phone)``. ``n_frames`` is ``world_spec.shape[0]`` in WORLD mode and
        ``shape[1]`` of the linear spectrogram otherwise.
    """
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # The linear spectrogram is identical in both modes.
    linear = audio.spectrogram(wav).astype(np.float32)
    linear_name = 'synpaflex-spec-%05d.npy' % index

    if hparams.vocoder == "world":
        rate = hparams.sample_rate
        f0, sp, ap = pw.wav2world(wav.astype(np.double), rate)
        ap_coded = pw.code_aperiodicity(ap, rate)
        sp_coded = pw.code_spectral_envelope(sp, rate, hparams.coded_env_dim)
        # Column layout: f0, coded envelope, coded aperiodicity.
        encoded = np.hstack([f0[:, np.newaxis], sp_coded, ap_coded])
        n_frames = encoded.shape[0]
        encoded_name = 'synpaflex-world-%05d.npy' % index
    else:
        n_frames = linear.shape[1]
        # The mel spectrogram is stored transposed.
        encoded = audio.melspectrogram(wav).astype(np.float32).T
        encoded_name = 'synpaflex-mel-%05d.npy' % index

    np.save(os.path.join(out_dir, linear_name), linear.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, encoded_name), encoded,
            allow_pickle=False)

    return (linear_name, encoded_name, n_frames, text, phone)
def _extract_mel(wav_path):
    """Compute an aligned audio signal and mel spectrogram for one wav.

    Depending on ``hparams.input_type`` the audio is mu-law quantized (with
    silence trimming), mu-law companded, or left as loaded. It is then
    padded and truncated so its length equals the number of mel frames times
    the hop size.

    Args:
        wav_path: Path to the wav file to load.

    Returns:
        A tuple ``(out, mel_spectrogram, timesteps, out_dtype)``: the aligned
        signal, the transposed float32 mel spectrogram, ``len(out)`` and the
        numpy dtype chosen for the signal (not applied here).
    """
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    input_type = hparams.input_type
    if is_mulaw_quantize(input_type):
        # Mu-law quantize, then trim silence from both representations.
        signal = P.mulaw_quantize(wav, hparams.quantize_channels)
        first, last = audio.start_and_end_indices(signal,
                                                  hparams.silence_threshold)
        wav = wav[first:last]
        signal = signal[first:last]
        pad_value = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(input_type):
        signal = P.mulaw(wav, hparams.quantize_channels)
        pad_value = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        signal = wav
        pad_value = 0.0
        out_dtype = np.float32

    # Mel spectrogram of the (possibly trimmed) wav, transposed.
    mel = audio.melspectrogram(wav).astype(np.float32).T
    hop = audio.get_hop_size()

    # Pad the signal by the amounts lws_pad_lr reports so that its length
    # lines up with the mel spectrogram.
    pad_left, pad_right = audio.lws_pad_lr(wav, hparams.fft_size, hop)
    signal = np.pad(signal, (pad_left, pad_right), mode="constant",
                    constant_values=pad_value)

    n_frames = mel.shape[0]
    assert len(signal) >= n_frames * hop

    # Truncate so the audio length is exactly n_frames * hop.
    signal = signal[:n_frames * hop]
    assert len(signal) % hop == 0
    assert len(signal) // n_frames == hop

    return signal, mel, len(signal), out_dtype
def _process_utterance(out_dir, in_dir, label, speaker_name, hparams):
    """Convert a speaker's wavs to ``.npz`` files split into train/test.

    The first ``(total // 10) * 9`` wav paths, in the order returned by
    ``glob``, go to ``<out_dir>/<speaker_name>/train`` and the rest to
    ``.../test``. Each archive holds the transposed mel spectrogram
    (``mel``), the speaker label (``speaker``), the number of wav samples
    (``seq_len``) and ``mel.shape[1]`` (``frame_len``).

    Args:
        out_dir: Root output directory.
        in_dir: Directory searched for ``*.wav`` files.
        label: Value stored under ``speaker`` in every archive.
        speaker_name: Name of the per-speaker sub-directory.
        hparams: Hyper-parameters (``sample_rate``, ``rescaling``,
            ``rescaling_max``, ``trim_silence``) and the object passed on to
            ``trim_silence`` / ``melspectrogram``.

    Returns:
        The number of wav paths found, or ``None`` when there are none.
    """
    wav_paths = glob.glob(os.path.join(in_dir, "*.wav"))
    if not wav_paths:
        return None

    total_utter_num = len(wav_paths)
    train_utter_num = (total_utter_num // 10) * 9
    print("[%s] train : %d, test : %d" %
          (speaker_name, train_utter_num, total_utter_num - train_utter_num))

    npz_dir = os.path.join(out_dir, speaker_name)
    os.makedirs(npz_dir, exist_ok=True)

    # Set up the train and test output directories.
    train_path = os.path.join(npz_dir, "train")
    test_path = os.path.join(npz_dir, "test")
    os.makedirs(train_path, exist_ok=True)
    os.makedirs(test_path, exist_ok=True)

    for idx, wav_path in enumerate(wav_paths):
        wav_name, ext = os.path.splitext(os.path.basename(wav_path))
        if ext != ".wav":
            continue

        wav, _ = librosa.load(wav_path, sr=hparams.sample_rate)

        if hparams.rescaling:
            wav = wav / np.abs(wav).max() * hparams.rescaling_max

        if hparams.trim_silence:
            # Remove leading and trailing silence.
            wav = trim_silence(wav, hparams)

        mel = melspectrogram(wav, hparams)

        # The split is decided by position in the glob result.
        target_dir = train_path if idx < train_utter_num else test_path

        np.savez(os.path.join(target_dir, wav_name),
                 mel=mel.T,
                 speaker=label,
                 seq_len=wav.shape[0],
                 frame_len=mel.shape[1])

    return total_utter_num
def process_video_file(vfile, args, split):
    """Extract audio, mel spectrogram and face crops from one video.

    All frames are read, then one frame is picked at the middle of each
    ``window_size`` ms window, with windows advancing by
    ``video_step_size_in_ms``. The audio is extracted to ``audio.wav`` by
    running the ``template`` command, its mel spectrogram is stored in
    ``mels.npz`` under the key ``spec``, and every picked frame with a valid
    detected face is saved as ``<i>.jpg`` after resizing to
    ``args.img_size``.

    Output directory:
    ``<args.final_data_root>/<split>/<parent dir of vfile>/<vfile stem>``.

    Args:
        vfile: Path to the video file ('/'-separated).
        args: Object providing ``final_data_root`` and ``img_size``.
        split: Name of the data split sub-directory.

    Returns:
        None. Returns early, writing no images, when the wav was not created.
    """
    capture = cv2.VideoCapture(vfile)
    frames = []
    ok, frame = capture.read()
    while ok:
        frames.append(frame)
        ok, frame = capture.read()
    capture.release()

    # Pick the middle frame of every sliding window.
    mid_frames = []
    ss = 0.
    es = (ss + (window_size / 1000.))
    while int(es * fps) <= len(frames):
        mid_second = (ss + es) / 2.
        mid_frames.append(frames[int(mid_second * fps)])
        ss += (video_step_size_in_ms / 1000.)
        es = (ss + (window_size / 1000.))

    path_parts = vfile.split('/')
    dst_subdir = path.join(path_parts[-2], path_parts[-1].split('.')[0])
    fulldir = path.join(args.final_data_root, split, dst_subdir)
    os.makedirs(fulldir, exist_ok=True)

    wavpath = path.join(fulldir, 'audio.wav')
    specpath = path.join(fulldir, 'mels.npz')

    # NOTE(review): the command string goes through the shell, so paths
    # containing spaces or shell metacharacters will not be handled safely.
    command = template.format(vfile, sr, wavpath)
    subprocess.call(command, shell=True)

    if not path.isfile(wavpath):
        return

    wav = audio.load_wav(wavpath, sr)
    spec = audio.melspectrogram(wav)
    np.savez_compressed(specpath, spec=spec)

    for i, picked in enumerate(mid_frames):
        face, valid_frame = face_detect(picked)
        if not valid_frame:
            continue
        resized_face = cv2.resize(face, (args.img_size, args.img_size))
        cv2.imwrite(path.join(fulldir, '{}.jpg'.format(i)), resized_face)
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    """Trim one utterance, save the trimmed wav and its spectrograms.

    The wav is cut to the span given by an HTS label file when one exists
    (path obtained by mapping ``wav48/`` to ``lab/``), trimmed with librosa
    and optionally rescaled. The result is saved to the path obtained by
    replacing ``wav48`` with ``wav_trim_22050`` in ``wav_path``; the linear
    and mel spectrograms are written, transposed, to ``out_dir``.

    Args:
        out_dir: Directory to write the spectrogram ``.npy`` files into.
        index: Unused; kept so existing callers keep working.
        speaker_id: Speaker identifier, returned unchanged.
        wav_path: Path to the wav file to load.
        text: Transcript, returned unchanged.

    Returns:
        A tuple ``(spectrogram_filename, mel_filename, n_frames, text,
        speaker_id)`` where ``n_frames`` is ``shape[1]`` of the linear
        spectrogram.
    """
    sr = hparams.sample_rate
    filename = os.path.basename(wav_path).replace('.wav', '')

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        # Original author's note: librosa trim seems to cut off the ending
        # part of speech.
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Save trimmed wav
    save_wav_path = re.sub('wav48', 'wav_trim_22050', wav_path)
    save_dir = os.path.dirname(save_wav_path)
    # os.makedirs replaces `os.system('mkdir {} -p')`: no shell is involved,
    # so spaces and shell metacharacters are safe, it works on platforms
    # without a `mkdir -p`, and failures raise instead of being ignored.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    audio.save_wav(wav, save_wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = '{}-spec.npy'.format(filename)
    mel_filename = '{}-mel.npy'.format(filename)
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text, speaker_id)
def preprocess_test(speaker_id, speaker_file_name, start_num=200, data_path=hp.origin_data):
    """Save mel spectrograms of a speaker's held-out utterances.

    Every entry of ``<data_path>/<speaker_file_name>`` from position
    ``start_num`` onwards (in ``os.listdir`` order) is loaded and its mel
    spectrogram is saved to
    ``<hp.dataset_test_path>/<speaker_id>_<utterance_id>.npy``, where
    ``utterance_id`` counts from 0 within the held-out slice.

    Args:
        speaker_id: Identifier used as prefix of the output file names.
        speaker_file_name: Name of the speaker's directory under
            ``data_path``.
        start_num: Number of leading directory entries to skip.
        data_path: Root directory of the original data.
    """
    out_dataset = hp.dataset_test_path
    if not os.path.exists(out_dataset):
        os.mkdir(out_dataset)

    speaker_dir = os.path.join(data_path, speaker_file_name)
    # NOTE(review): os.listdir order is arbitrary, so which files form the
    # held-out slice depends on the file system - confirm this is intended.
    held_out = os.listdir(speaker_dir)[start_num:]

    for utterance_id, wav_file in enumerate(held_out):
        wav = audio.load_wav(os.path.join(speaker_dir, wav_file))
        mel_spec = audio.melspectrogram(wav)
        out_name = str(speaker_id) + "_" + str(utterance_id) + ".npy"
        np.save(os.path.join(out_dataset, out_name), mel_spec)
def _process_utterance(out_dir, index, wav_path, text, silence_threshold,
                       fft_size):
    '''Quantizes, trims and featurizes one utterance and saves both arrays.

    Args:
      out_dir: The directory to write the .npy files into
      index: Not used by this implementation
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file; returned unchanged
      silence_threshold: Forwarded to audio.start_and_end_indices
      fft_size: Forwarded to audio.lws_pad_lr

    Returns:
      A (audio_filename, mel_filename, timesteps, text) tuple, where
      timesteps is the length of the saved quantized signal
    '''
    # Read the waveform and mu-law quantize it.
    wav = audio.load_wav(wav_path)
    quantized = P.mulaw_quantize(wav)

    # Cut leading/trailing silence from both representations.
    first, last = audio.start_and_end_indices(quantized, silence_threshold)
    quantized, wav = quantized[first:last], wav[first:last]

    # Mel spectrogram of the trimmed waveform, transposed to (N, D).
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    n_mel_frames = mel_spectrogram.shape[0]

    # lws pads zeros internally before performing the STFT, so the quantized
    # signal is padded the same way to align its time resolution with the
    # mel spectrogram.
    hop_size = audio.get_hop_size()
    pad_left, pad_right = audio.lws_pad_lr(wav, fft_size, hop_size)
    silence_value = P.mulaw_quantize(0)
    quantized = np.pad(quantized, (pad_left, pad_right), mode="constant",
                       constant_values=silence_value)

    # Time resolution adjustment: keep exactly hop_size samples per frame so
    # that transposed convolution can be used for upsampling.
    target_length = n_mel_frames * hop_size
    assert len(quantized) >= target_length
    quantized = quantized[:target_length]
    assert len(quantized) % hop_size == 0
    timesteps = len(quantized)

    # Output names derive from the file name up to its first dot.
    wav_id = wav_path.rsplit('/', 1)[-1].partition('.')[0]
    audio_filename = '{}-audio.npy'.format(wav_id)
    mel_filename = '{}-mel.npy'.format(wav_id)

    # Write both arrays to disk.
    np.save(os.path.join(out_dir, audio_filename),
            quantized.astype(np.int16), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    return (audio_filename, mel_filename, timesteps, text)
def wavenet_data():
    '''Derives WaveNet-style inputs from the enclosing-scope `wav` and plots them.

    Reads `wav` from the enclosing (module) scope, mu-law quantizes it, trims
    silence, computes the mel spectrogram, pads/crops the quantized signal to
    a whole number of hops, and shows three stacked plots: the mel
    spectrogram, the quantized signal and the trimmed waveform.

    Returns:
      None. The function blocks in plt.show() and returns nothing.
    '''
    import matplotlib.pyplot as plt

    # Mu-law quantize.
    # `wav` is deliberately never rebound in this function: assigning to it
    # would make it a local name and the read below would raise
    # UnboundLocalError. The trimmed signal therefore gets its own name.
    out = P.mulaw_quantize(wav, hparams.quantize_channels)

    # Trim silences
    start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
    trimmed_wav = wav[start:end]
    out = out[start:end]
    constant_values = P.mulaw_quantize(0, hparams.quantize_channels)

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(trimmed_wav).astype(np.float32).T

    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(trimmed_wav, hparams.fft_size,
                            audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant",
                 constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    plt.subplot(3, 1, 1)
    # NOTE(review): sr is hard-coded to 20000 while hop_length comes from
    # hparams; confirm this matches the sample rate of `wav`.
    specshow(mel_spectrogram.T, sr=20000, hop_length=hparams.hop_size)
    plt.subplot(3, 1, 2)
    plt.plot(out)
    plt.xlim(0, len(out))
    plt.subplot(3, 1, 3)
    plt.plot(trimmed_wav)
    plt.xlim(0, len(trimmed_wav))
    plt.show()

    # Out-of-place division: an in-place `/=` cannot store the float result
    # back into an integer array and raises a casting error.
    # NOTE(review): the normalized signal is neither returned nor used
    # afterwards; kept from the original pending a decision on the intent.
    out = out / out.max()
def _process_utterance(out_dir, index, speaker_id, wav_path, text,
                       hparams=hparams):
    '''Trims one utterance and writes its linear and mel spectrograms.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames
      speaker_id: Returned unchanged as the last element of the result
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file; returned unchanged
      hparams: Hyper-parameter object; it is also handed to
        audio.set_hparams before any audio processing

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text, speaker_id) tuple
    '''
    sr = hparams.sample_rate
    audio.set_hparams(hparams)

    # Read the waveform into a numpy array.
    wav = audio.load_wav(wav_path)

    # When an HTS label file sits next to the wav, cut to the labelled span
    # first and trim with a looser threshold; otherwise trim more tightly.
    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")
    if exists(lab_path):
        labels = hts.load(lab_path)
        begin = int(start_at(labels) * 1e-7 * sr)
        finish = int(end_at(labels) * 1e-7 * sr)
        wav = wav[begin:finish]
        trim_db = 25
    else:
        trim_db = 15
    wav, _ = librosa.effects.trim(wav, top_db=trim_db)

    # Optional peak normalization.
    if hparams.rescaling:
        peak = np.abs(wav).max()
        wav = wav / peak * hparams.rescaling_max

    # Linear-scale spectrogram; its second axis gives the frame count.
    linear = audio.spectrogram(wav).astype(np.float32)
    n_frames = linear.shape[1]

    # Mel-scale spectrogram.
    mel = audio.melspectrogram(wav).astype(np.float32)

    # Persist both features, transposed, under index-based names.
    spectrogram_filename = 'vctk-spec-%05d.npy' % index
    mel_filename = 'vctk-mel-%05d.npy' % index
    outputs = ((spectrogram_filename, linear), (mel_filename, mel))
    for name, features in outputs:
        np.save(os.path.join(out_dir, name), features.T, allow_pickle=False)

    return (spectrogram_filename, mel_filename, n_frames, text, speaker_id)
def _process_utterance(out_dir, index, wav_path, pinyin):
    '''Preprocesses a single utterance audio/pinyin pair.

    The clip is peak-normalized, silence-trimmed and converted to linear and
    mel spectrograms, which are written to disk. Clips whose frame count
    exceeds hp.max_frame_num are skipped and nothing is written for them.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames
      wav_path: Path to the audio file containing the speech input
      pinyin: The pinyin of Chinese spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, pinyin) tuple to write
      to train.txt, or None when the clip is longer than hp.max_frame_num
    '''
    # Read the clip and scale its peak to 0.999 so that all clips share a
    # unified level, then strip silence.
    raw = audio.load_wav(wav_path)
    peak = np.abs(raw).max()
    wav = audio.trim_silence(raw / peak * 0.999)

    # Linear-scale spectrogram; over-long clips are rejected before any
    # further work is done.
    linear = audio.spectrogram(wav).astype(np.float32)
    n_frames = linear.shape[1]
    if n_frames > hp.max_frame_num:
        return None

    # Mel-scale spectrogram.
    mel = audio.melspectrogram(wav).astype(np.float32)

    # Store both features, transposed, under index-based names.
    spectrogram_filename = 'biaobei-spec-%05d.npy' % index
    mel_filename = 'biaobei-mel-%05d.npy' % index
    spec_path = os.path.join(out_dir, spectrogram_filename)
    mel_path = os.path.join(out_dir, mel_filename)
    np.save(spec_path, linear.T, allow_pickle=False)
    np.save(mel_path, mel.T, allow_pickle=False)

    return (spectrogram_filename, mel_filename, n_frames, pinyin)
def preprocess_test(speaker_id, speaker_file_name, class_num_remaining=50,
                    data_path=hp.origin_data):
    '''Writes mel spectrograms for the trailing utterances of one speaker.

    The speaker directory data_path/speaker_file_name is listed once and the
    entries from position len(entries) - class_num_remaining onwards are
    loaded, converted to mel spectrograms and saved into
    hp.dataset_test_path as '<speaker_id>_<utterance_id>.npy', where
    utterance_id counts from 0 within the selected entries.

    Args:
      speaker_id: Identifier used as the prefix of the output file names
      speaker_file_name: Name of the speaker's directory below data_path
      class_num_remaining: Number of trailing directory entries to process
      data_path: Root directory holding the per-speaker directories

    Returns:
      None. The results are the files written to hp.dataset_test_path.
    '''
    out_dataset = hp.dataset_test_path
    # exist_ok avoids the check-then-create race when several speakers are
    # preprocessed concurrently, and also creates missing parent directories.
    os.makedirs(out_dataset, exist_ok=True)

    file_path = os.path.join(data_path, speaker_file_name)
    # List the directory a single time so the count and the slice are taken
    # from the same snapshot.
    # NOTE(review): os.listdir returns entries in arbitrary order; the order
    # is kept as-is so the selection stays consistent with other code
    # slicing the same listing.
    entries = os.listdir(file_path)
    cut_length = len(entries) - class_num_remaining
    # NOTE(review): if class_num_remaining exceeds len(entries), cut_length is
    # negative and the slice keeps only the last abs(cut_length) entries.
    # This matches the previous behavior; confirm it is what callers expect.
    wav_file_list = entries[cut_length:]

    for utterance_id, wav_file in enumerate(wav_file_list):
        wav = audio.load_wav(os.path.join(file_path, wav_file))
        mel_spec = audio.melspectrogram(wav)
        save_file_name = "{}_{}.npy".format(speaker_id, utterance_id)
        np.save(os.path.join(out_dataset, save_file_name), mel_spec)
def _process_utterance(wav_path):
    '''Computes the linear and mel spectrograms of one audio file in memory.

    The waveform is loaded, trimmed with librosa (top_db=15) and, when
    hparams.rescaling is set, peak-normalized to hparams.rescaling_max.
    Nothing is written to disk.

    Args:
      wav_path: Path to the audio file containing the speech input

    Returns:
      A (spectrogram, mel_spectrogram) pair of float32 arrays, each the
      transpose of what the audio module produced.
      NOTE(review): presumably shaped (frames, bins) after the transpose;
      confirm against audio.spectrogram.
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    return spectrogram.T, mel_spectrogram.T