import math
import random

import numpy as np

# Project-local helpers (read_wav, read_wav_from_arr, trim_wav, crop_random_wav,
# augment_volume, fix_length, wav2melspec_db) and the hyperparameter object `hp`
# are assumed to be imported from elsewhere in this codebase.


def get_random_wav_and_label(self, tar_wavfiles, ntar_wavfiles):
    """
    :return: wav: raw wave. float32. shape=(t, ),
             melspec: mel-spectrogram. float32. shape=(t, n_mels),
             label: 1.0 if target, 0.0 otherwise. float32.
    """
    # Pick the target pool with probability tar_ratio, the non-target pool otherwise.
    wavfiles, label = (tar_wavfiles, self.tar_labels) if np.random.sample() <= self.tar_ratio \
        else (ntar_wavfiles, self.ntar_labels)
    wavfile = wavfiles[np.random.randint(0, len(wavfiles))]
    if isinstance(wavfile, bytes):
        wavfile = wavfile.decode()
    if wavfile.endswith('arr'):  # pyarrow format
        wav = read_wav_from_arr(wavfile)
    else:
        wav = read_wav(wavfile, sr=hp.signal.sr)
    wav = trim_wav(wav)
    wav = crop_random_wav(wav, self.length)
    wav = augment_volume(wav)
    wav = fix_length(wav, self.length)  # padding
    melspec = wav2melspec_db(wav, sr=hp.signal.sr, n_fft=hp.signal.n_fft,
                             win_length=hp.signal.win_length, hop_length=hp.signal.hop_length,
                             n_mels=hp.signal.n_mels, min_db=hp.signal.min_db,
                             max_db=hp.signal.max_db)
    melspec = np.float32(melspec)
    label = np.float32(label)
    return wav, melspec, label
def _get_wav_and_melspec(wav_file, length, is_training=True):
    """ The range of values of wav is [-1, 1]. """
    wav = read_wav(wav_file, sr=hp.signal.sr)
    wav = trim_wav(wav)
    # Divide the wav into chunks of the given length. A chunk is selected at
    # random in training, but the first chunk is used in generation.
    n_clips = math.ceil(len(wav) / length) if is_training else 1
    idx = random.randrange(n_clips)
    start, end = length * idx, length * (idx + 1)
    wav = wav[start:end]
    assert len(wav) <= length
    wav = fix_length(wav, length)  # padding in case of the last chunk.
    melspec = wav2melspec_db(wav, sr=hp.signal.sr, n_fft=hp.signal.n_fft,
                             win_length=hp.signal.win_length, hop_length=hp.signal.hop_length,
                             n_mels=hp.signal.n_mels, min_db=hp.signal.min_db,
                             max_db=hp.signal.max_db)
    wav = np.expand_dims(wav, -1)
    return wav, melspec.astype(np.float32)
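# Hedged usage sketch: 'sample.wav' is an illustrative path, not from the
# original source; length is the chunk size in samples derived from the
# hyperparameters, as elsewhere in this codebase.
length = int(hp.signal.duration * hp.signal.sr)
wav, melspec = _get_wav_and_melspec('sample.wav', length, is_training=True)
# wav has shape (length, 1); melspec has shape (t, hp.signal.n_mels).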
def _load_random_wav(self, speaker_id):
    wavfile = self.audio_meta.get_random_audio(speaker_id)
    wav = read_wav(wavfile, hp.signal.sr)
    # wav = trim_wav(wav)
    length = int(hp.signal.duration * hp.signal.sr)
    wav = crop_random_wav(wav, length=length)
    wav = fix_length(wav, length, mode='reflect')
    return wav  # raw wave. shape=(t, )
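# Illustration of the 'reflect' padding used above. This assumes fix_length
# wraps librosa.util.fix_length, which forwards extra kwargs (e.g. mode) to
# np.pad; that wrapping is an assumption, not confirmed by the source.
import numpy as np
import librosa

short = np.arange(4, dtype=np.float32)  # [0., 1., 2., 3.]
padded = librosa.util.fix_length(short, size=8, mode='reflect')
# Reflect padding mirrors the tail of the signal instead of appending zeros,
# which avoids an artificial silent segment at the end of short clips.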
def get_random_wav(self, wavfile):
    """
    :param wavfile: a raw wave file.
    :return: wav: raw wave. float32. shape=(t, ),
             melspec: mel-spectrogram. float32. shape=(t, n_mels),
             wavfile: the raw wave file.
    """
    wav = read_wav(wavfile, sr=hp.signal.sr)
    wav = trim_wav(wav)
    wav = fix_length(wav, self.length)  # crop from the beginning.
    melspec = wav2melspec_db(wav, sr=hp.signal.sr, n_fft=hp.signal.n_fft,
                             win_length=hp.signal.win_length, hop_length=hp.signal.hop_length,
                             n_mels=hp.signal.n_mels, min_db=hp.signal.min_db,
                             max_db=hp.signal.max_db)
    melspec = np.float32(melspec)
    return wav, melspec, wavfile
# Variant of _get_wav_and_melspec above: length is optional, and when omitted
# the full trimmed wav is converted without chunking.
def _get_wav_and_melspec(wav_file, length=None, is_training=True):
    wav = read_wav(wav_file, sr=hp.signal.sr)
    wav = trim_wav(wav)
    if length:
        n_clips = math.ceil(len(wav) / length) if is_training else 1
        idx = random.randrange(n_clips)
        start, end = length * idx, length * (idx + 1)
        wav = wav[start:end]
        assert len(wav) <= length
        wav = fix_length(wav, length)  # padding
    melspec = wav2melspec_db(wav, sr=hp.signal.sr, n_fft=hp.signal.n_fft,
                             win_length=hp.signal.win_length, hop_length=hp.signal.hop_length,
                             n_mels=hp.signal.n_mels, min_db=hp.signal.min_db,
                             max_db=hp.signal.max_db)
    wav = np.expand_dims(wav, -1)
    return wav, melspec.astype(np.float32)
ckpt = args.ckpt if args.ckpt else tf.train.latest_checkpoint(hp.logdir)
pred_conf = PredictConfig(
    model=model,
    input_names=['x'],
    output_names=['embedding/embedding', 'prediction'],
    session_init=SaverRestore(ckpt) if ckpt else None)
embedding_pred = OfflinePredictor(pred_conf)

embedding, pred_speaker_id = embedding_pred(mel_spec)

# get a random audio clip of each predicted speaker.
wavfile_pred_speaker = np.array([audio_meta_train.get_random_audio(s) for s in pred_speaker_id])
length = int(hp.signal.duration * hp.signal.sr)
wav_pred_speaker = np.array(
    [fix_length(read_wav(w, hp.signal.sr, duration=hp.signal.duration), length)
     for w in wavfile_pred_speaker])

# write audio
tf.summary.audio('wav', wav, hp.signal.sr, max_outputs=10)
tf.summary.audio('wav_pred', wav_pred_speaker, hp.signal.sr, max_outputs=10)

# write prediction. The formatting below assumes both hp.embed.meta_path and
# hp.train.meta_path are set, so that meta and pred_meta are lists, not None.
speaker_name = [audio_meta.speaker_dict[sid] for sid in speaker_id]
pred_speaker_name = [audio_meta_train.speaker_dict[sid] for sid in pred_speaker_id]
meta = [tuple(audio_meta.meta_dict[sid][k] for k in audio_meta.target_meta_field())
        for sid in speaker_id] if hp.embed.meta_path else None
pred_meta = [tuple(audio_meta_train.meta_dict[sid][k] for k in audio_meta_train.target_meta_field())
             for sid in pred_speaker_id] if hp.train.meta_path else None
prediction = ['{} ({}) -> {} ({})'.format(s, s_meta, p, p_meta)
              for s, p, s_meta, p_meta in zip(speaker_name, pred_speaker_name, meta, pred_meta)]
tf.summary.text('prediction', tf.convert_to_tensor(prediction))
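# Why list comprehensions replace map() above: in Python 3, map() returns a
# lazy iterator, and np.array() wraps an iterator as a 0-d object array rather
# than evaluating it. A standalone illustration:
import numpy as np

broken = np.array(map(abs, [-1, -2]))         # 0-d object array wrapping the map object
fixed = np.array([abs(x) for x in [-1, -2]])  # array([1, 2]) as intended
assert broken.shape == () and fixed.shape == (2,)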