def convert(predictor, df):
    a, b, c = next(df().get_data())
    pred_spec, r_spec = predictor(a, b, c)

    # Denormalization
    pred_spec = denormalize_db(pred_spec, hp.default.max_db, hp.default.min_db)
    r_spec = denormalize_db(r_spec, hp.default.max_db, hp.default.min_db)

    # Db to amp
    pred_spec = db2amp(pred_spec)
    r_spec = db2amp(r_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.convert.emphasis_magnitude)
    r_spec = np.power(r_spec, hp.convert.emphasis_magnitude)

    # Spectrogram to waveform
    audio = np.array([spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                               hp.default.hop_length, hp.default.n_iter) for spec in pred_spec])
    y_audio = np.array([spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                                 hp.default.hop_length, hp.default.n_iter) for spec in r_spec])

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.default.preemphasis)
    y_audio = inv_preemphasis(y_audio, coeff=hp.default.preemphasis)

    # if hp.convert.one_full_wav:
    #     # Concatenate to a wav
    #     y_audio = np.reshape(y_audio, (1, y_audio.size), order='C')
    #     audio = np.reshape(audio, (1, audio.size), order='C')

    return audio, y_audio
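# These snippets lean on a few helpers (denormalize_db, db2amp, inv_preemphasis)
# whose definitions are not shown in this section. A minimal sketch of what they
# are assumed to do, based on the standard dB/normalization conventions used in
# the calls above -- treat the exact bodies as assumptions, not the project's code:
import numpy as np
from scipy import signal

def denormalize_db(norm_db, max_db, min_db):
    # Map a [0, 1]-normalized dB spectrogram back to the [min_db, max_db] range.
    return np.clip(norm_db, 0, 1) * (max_db - min_db) + min_db

def db2amp(db):
    # Decibels to linear amplitude: amp = 10 ** (db / 20).
    return np.power(10.0, db * 0.05)

def inv_preemphasis(preem_wav, coeff=0.97):
    # Undo pre-emphasis y[t] = x[t] - coeff * x[t-1] with the inverse IIR filter.
    return signal.lfilter([1], [1, -coeff], preem_wav)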
def convert(predictor, df):
    pred_spec, y_spec, ppgs = predictor(next(df().get_data()))

    # Denormalization
    pred_spec = denormalize_db(pred_spec, hp.Default.max_db, hp.Default.min_db)
    y_spec = denormalize_db(y_spec, hp.Default.max_db, hp.Default.min_db)

    # Db to amp
    pred_spec = db2amp(pred_spec)
    y_spec = db2amp(y_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.Convert.emphasis_magnitude)
    y_spec = np.power(y_spec, hp.Convert.emphasis_magnitude)

    # Spectrogram to waveform
    audio = np.array([spec2wav(spec.T, hp.Default.n_fft, hp.Default.win_length,
                               hp.Default.hop_length, hp.Default.n_iter) for spec in pred_spec])
    y_audio = np.array([spec2wav(spec.T, hp.Default.n_fft, hp.Default.win_length,
                                 hp.Default.hop_length, hp.Default.n_iter) for spec in y_spec])

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.Default.preemphasis)
    y_audio = inv_preemphasis(y_audio, coeff=hp.Default.preemphasis)

    # if hp.Convert.one_full_wav:
    #     # Concatenate to a wav
    #     y_audio = np.reshape(y_audio, (1, y_audio.size), order='C')
    #     audio = np.reshape(audio, (1, audio.size), order='C')

    return audio, y_audio, ppgs
def convert(predictor, df):
    t = next(df().get_data())
    print(t[0].shape)
    # Reuse the batch fetched above rather than pulling a second one.
    pred_spec, y_spec, ppgs = predictor(t)

    # Denormalization
    pred_spec = denormalize_db(pred_spec, hp.default.max_db, hp.default.min_db)
    y_spec = denormalize_db(y_spec, hp.default.max_db, hp.default.min_db)

    # Db to amp
    pred_spec = db2amp(pred_spec)
    y_spec = db2amp(y_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.convert.emphasis_magnitude)
    y_spec = np.power(y_spec, hp.convert.emphasis_magnitude)

    # Spectrogram to waveform (list() around map() so np.array sees the items, not the iterator)
    audio = np.array(
        list(map(lambda spec: spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                                       hp.default.hop_length, hp.default.n_iter),
                 pred_spec)))
    librosa.output.write_wav('/home/user/vilin/deep-voice-conversion/output/file_trim_8.wav',
                             audio[0], hp.default.sr)
    y_audio = np.array(
        list(map(lambda spec: spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                                       hp.default.hop_length, hp.default.n_iter),
                 y_spec)))

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.default.preemphasis)
    y_audio = inv_preemphasis(y_audio, coeff=hp.default.preemphasis)

    # if hp.convert.one_full_wav:
    #     # Concatenate to a wav
    #     y_audio = np.reshape(y_audio, (1, y_audio.size), order='C')
    #     audio = np.reshape(audio, (1, audio.size), order='C')

    return audio, y_audio, ppgs
def convert(predictor, data):
    x_mfccs, y_spec, y_mel = data
    # Add a leading batch dimension of 1 to each input.
    x_mfccs = np.array(x_mfccs).reshape((-1,) + x_mfccs.shape)
    y_spec = np.array(y_spec).reshape((-1,) + y_spec.shape)
    y_mel = np.array(y_mel).reshape((-1,) + y_mel.shape)
    pred_spec, y_spec, ppgs = predictor(x_mfccs, y_spec, y_mel)

    # Denormalization
    pred_spec = denormalize_db(pred_spec, hp.Default.max_db, hp.Default.min_db)
    y_spec = denormalize_db(y_spec, hp.Default.max_db, hp.Default.min_db)

    # Db to amp
    pred_spec = db2amp(pred_spec)
    y_spec = db2amp(y_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.Convert.emphasis_magnitude)
    y_spec = np.power(y_spec, hp.Convert.emphasis_magnitude)

    # Spectrogram to waveform
    audio = np.array([
        spec2wav(spec.T, hp.Default.n_fft, hp.Default.win_length,
                 hp.Default.hop_length, hp.Default.n_iter) for spec in pred_spec
    ])
    y_audio = np.array([
        spec2wav(spec.T, hp.Default.n_fft, hp.Default.win_length,
                 hp.Default.hop_length, hp.Default.n_iter) for spec in y_spec
    ])

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.Default.preemphasis)
    y_audio = inv_preemphasis(y_audio, coeff=hp.Default.preemphasis)

    if hp.Convert.one_full_wav:
        # Concatenate to a wav
        y_audio = np.reshape(y_audio, (1, y_audio.size), order='C')
        audio = np.reshape(audio, (1, audio.size), order='C')

    return audio, y_audio, ppgs
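# A hypothetical driver for the convert() variant above. The input path, the
# already-built `predictor`, and the use of soundfile are illustrative
# assumptions, not code from the original project; get_mfccs_and_spectrogram
# appears in the predict() snippet further below.
import soundfile as sf

data = get_mfccs_and_spectrogram('datasets/test.wav')   # (x_mfccs, y_spec, y_mel)
audio, y_audio, ppgs = convert(predictor, data)
sf.write('pred.wav', audio[0], hp.Default.sr)   # converted audio
sf.write('ref.wav', y_audio[0], hp.Default.sr)  # reference reconstruction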
def predict(self, path_to_wav):
    x_mfcc, y_spec, mel = get_mfccs_and_spectrogram(path_to_wav)
    pred_spec, y_spec, ppgs = self.predictor(x_mfcc, y_spec, mel)

    pred_spec = denormalize_db(pred_spec, hp.default.max_db, hp.default.min_db)
    y_spec = denormalize_db(y_spec, hp.default.max_db, hp.default.min_db)

    pred_spec = db2amp(pred_spec)
    y_spec = db2amp(y_spec)

    pred_spec = np.power(pred_spec, hp.convert.emphasis_magnitude)
    y_spec = np.power(y_spec, hp.convert.emphasis_magnitude)

    # Spectrogram to waveform (list() around map() so np.array sees the items)
    audio = np.array(
        list(map(lambda spec: spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                                       hp.default.hop_length, hp.default.n_iter),
                 pred_spec)))
    y_audio = np.array(
        list(map(lambda spec: spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                                       hp.default.hop_length, hp.default.n_iter),
                 y_spec)))

    # The original snippet ended without returning; return the computed arrays
    # so the method is usable.
    return audio, y_audio, ppgs
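# spec2wav is used throughout these snippets and is assumed to be Griffin-Lim
# phase reconstruction from a linear magnitude spectrogram. A minimal sketch
# using librosa (an assumption about the helper's behavior, not the original
# implementation; librosa.griffinlim infers n_fft from the spectrogram shape):
import librosa

def spec2wav(mag, n_fft, win_length, hop_length, n_iter):
    # mag: (1 + n_fft // 2, frames) linear-amplitude spectrogram.
    return librosa.griffinlim(mag, n_iter=n_iter,
                              hop_length=hop_length, win_length=win_length)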
def convert(predictor, df):
    # TODO: need to fix reading in with duration
    pred_spec, y_spec, ppgs = predictor(next(df().get_data()))

    # Denormalization
    pred_spec = denormalize_db(pred_spec, hp.default.max_db, hp.default.min_db)
    y_spec = denormalize_db(y_spec, hp.default.max_db, hp.default.min_db)

    # Db to amp
    pred_spec = db2amp(pred_spec)
    y_spec = db2amp(y_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.convert.emphasis_magnitude)
    y_spec = np.power(y_spec, hp.convert.emphasis_magnitude)

    # Spectrogram to waveform (list() around map() so np.array sees the items)
    audio = np.array(
        list(map(lambda spec: spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                                       hp.default.hop_length, hp.default.n_iter),
                 pred_spec)))
    y_audio = np.array(
        list(map(lambda spec: spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                                       hp.default.hop_length, hp.default.n_iter),
                 y_spec)))

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.default.preemphasis)
    y_audio = inv_preemphasis(y_audio, coeff=hp.default.preemphasis)

    if hp.convert.one_full_wav:
        # Concatenate to a wav
        y_audio = np.reshape(y_audio, (1, y_audio.size), order='C')
        audio = np.reshape(audio, (1, audio.size), order='C')

    return audio, y_audio, ppgs
def sumimage(mel, mel_name):
    # mel = mel + 0.001 * np.random.standard_normal([hp.batch_size, hp.duration * hp.n_mels, hp.n_mels])
    mel_image = mel.transpose(0, 2, 1)
    heatmap = np.expand_dims(mel_image, 3)
    tf.summary.image(mel_name, heatmap, max_outputs=mel_image.shape[0])

    # Invert the mel filterbank with its pseudo-inverse to recover a linear spectrogram.
    mel_basis = librosa.filters.mel(sr=hp.sr, n_fft=hp.n_fft, n_mels=hp.n_mels)
    inv_mel_basis = np.linalg.pinv(mel_basis)

    mel_spec = []
    for i in range(len(mel)):
        print(mel_name, np.max(mel[i]), np.min(mel[i]), np.mean(mel[i]))
        # mel[i] = mel[i] * (0.6 / np.max(mel[i]))
        mel_db_item = np.transpose(mel[i])  # (n_mels, frames)
        mel_db_item = denormalize_0_1(mel_db_item, hp.max_db, hp.min_db)
        mel_item = db2amp(mel_db_item)
        mag_item = np.maximum(np.dot(inv_mel_basis, mel_item), 0)  # (1 + n_fft // 2, frames)
        spec_item = np.transpose(mag_item)
        mel_spec.append(spec_item)

    mel_spec = np.power(mel_spec, hp.emphasis_magnitude)
    mel_audio = np.array(
        list(map(lambda spec: spec2wav(spec.T, hp.n_fft, hp.win_length,
                                       hp.hop_length, hp.n_iter),
                 mel_spec)))
    mel_audio = inv_preemphasis(mel_audio, coeff=hp.preemphasis)
    tf.summary.audio(mel_name, mel_audio, hp.sr, max_outputs=hp.batch_size)
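# The per-example loop above can be collapsed into one batched projection.
# A sketch under the same shape assumptions; `mel_amp` is a hypothetical name
# for the batch after the same denormalize_0_1/db2amp steps:
mel_basis = librosa.filters.mel(sr=hp.sr, n_fft=hp.n_fft, n_mels=hp.n_mels)
inv_mel_basis = np.linalg.pinv(mel_basis)        # (1 + n_fft // 2, n_mels)
# mel_amp: (batch, frames, n_mels) -> mag: (batch, frames, 1 + n_fft // 2)
mag = np.maximum(np.einsum('fm,btm->btf', inv_mel_basis, mel_amp), 0)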
def sumspecimage(spec, spec_name):
    spec = denormalize_db(spec, hp.max_db, hp.min_db)
    spec = db2amp(spec)
    spec_image = spec.transpose(0, 2, 1)
    heatmap = np.expand_dims(spec_image, 3)
    tf.summary.image(spec_name, heatmap, max_outputs=spec_image.shape[0])

    # Exponent of 1 leaves the magnitude as-is; swap in hp.emphasis_magnitude to emphasize it.
    out_spec = np.power(np.maximum(spec, 0), 1)
    out_audio = np.array(
        list(map(lambda spec: spec2wav(spec.T, hp.n_fft, hp.win_length,
                                       hp.hop_length, hp.n_iter),
                 out_spec)))
    out_audio = inv_preemphasis(out_audio, coeff=hp.preemphasis)
    tf.summary.audio(spec_name, out_audio, hp.sr, max_outputs=hp.batch_size)
def convert(predictor, tensor):
    # tensor = next(df().get_data())
    pred_spec, y_spec, ppgs = predictor(tensor)

    # Denormalization (the y_spec reference path is disabled throughout)
    pred_spec = denormalize_db(pred_spec, hp.default.max_db, hp.default.min_db)
    # y_spec = denormalize_db(y_spec, hp.default.max_db, hp.default.min_db)

    # Db to amp
    pred_spec = db2amp(pred_spec)
    # y_spec = db2amp(y_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.convert.emphasis_magnitude)
    # y_spec = np.power(y_spec, hp.convert.emphasis_magnitude)

    # Spectrogram to waveform (list() around map() so np.array sees the items)
    audio = np.array(
        list(map(lambda spec: spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                                       hp.default.hop_length, hp.default.n_iter),
                 pred_spec)))
    # y_audio = np.array(list(map(lambda spec: spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
    #                                                   hp.default.hop_length, hp.default.n_iter), y_spec)))

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.default.preemphasis)
    # y_audio = inv_preemphasis(y_audio, coeff=hp.default.preemphasis)

    # pickle.dump(y_audio, open("y-audio.p", "wb"))
    # pickle.dump(audio, open("o-audio.p", "wb"))

    return audio, ppgs
import numpy as np
import matplotlib.pyplot as plt

from audio import spec2wav, wav2spec, read_wav, write_wav

if __name__ == '__main__':
    sr = 22050
    n_fft = 512
    win_length = 400
    hop_length = 80
    duration = 2  # sec

    wav = read_wav("H:\\cs230\\wav_x\\1_1.wav", sr, duration)
    spec, _ = wav2spec(wav, n_fft, win_length, hop_length, False)
    converted_wav = spec2wav(spec, n_fft, win_length, hop_length, 600)
    write_wav(converted_wav, sr, 'a.wav')

    plt.pcolormesh(spec)
    plt.ylabel('Frequency')
    plt.xlabel('Time')
    plt.savefig("a.png")
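# The `audio` module imported above is not shown in this section. A minimal
# sketch of the helpers the script assumes, with signatures inferred from the
# call sites (these bodies are assumptions, not the project's actual module;
# spec2wav is sketched earlier in this section):
import librosa
import numpy as np
import soundfile as sf

def read_wav(path, sr, duration=None):
    wav, _ = librosa.load(path, sr=sr, mono=True, duration=duration)
    return wav

def wav2spec(wav, n_fft, win_length, hop_length, time_first=True):
    # Returns (magnitude, phase); transposed to (frames, bins) when time_first.
    stft = librosa.stft(wav, n_fft=n_fft, win_length=win_length, hop_length=hop_length)
    mag, phase = np.abs(stft), np.angle(stft)
    if time_first:
        mag, phase = mag.T, phase.T
    return mag, phase

def write_wav(wav, sr, path):
    sf.write(path, wav, sr)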