def test_MCD_and_f0():
    hparams = create_hparams()
    stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                        hparams.win_length, hparams.n_mel_channels,
                        hparams.sampling_rate, hparams.mel_fmin,
                        hparams.mel_fmax)
    audio_path = 'kakao/1/1_0001.wav'
    mel_path = 'kakao/1/1_0001.mel.npy'

    # Load a precomputed mel spectrogram as the source and clamp it to the
    # expected dynamic range.
    srcMel = torch.from_numpy(np.load(mel_path)).unsqueeze(0)
    srcMel = torch.clamp(srcMel, -4.0, 4.0)

    # Recompute the target mel spectrogram directly from the waveform.
    audio, sr = load_wav_to_torch(audio_path)
    audio_norm = audio / hparams.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    dstMel = stft.mel_spectrogram(audio_norm)

    log_MCD = MCD_from_mels(stft, srcMel, dstMel)
    print(log_MCD.data, 'log')

    # sqDiffF0_from_mels returns the squared F0 difference per frame.
    sqDiffF0 = sqDiffF0_from_mels(stft, srcMel, dstMel)
    print(sqDiffF0)
    meanSqDiffF0 = torch.mean(sqDiffF0)
    print(meanSqDiffF0.data, '100hz')
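# For reference, a minimal sketch of the standard mel-cepstral distortion
# formula between two aligned cepstral sequences; MCD_from_mels above is
# assumed to compute something equivalent. The constant 10 / ln(10) * sqrt(2)
# converts the per-frame L2 distance over cepstral coefficients into dB.
import math
import torch

def mcd_sketch(c1, c2):
    """c1, c2: (frames, n_cepstra) tensors, typically excluding c0 (energy)."""
    k = 10.0 / math.log(10.0) * math.sqrt(2.0)
    return k * torch.mean(torch.sqrt(torch.sum((c1 - c2) ** 2, dim=1)))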
def load_data(datapath, glob_file_str, scale=True, data_split=(0.8, 0.1)):
    data = defaultdict(list)
    stft = TacotronSTFT(filter_length=1024, hop_length=160, win_length=1024,
                        sampling_rate=16000, n_mel_channels=64, mel_fmin=0,
                        mel_fmax=None, representation='asrgen')
    for folderpath in sorted(glob.glob(os.path.join(datapath, '*/'))):
        label = os.path.basename(os.path.normpath(folderpath))
        filepaths = glob.glob(
            os.path.join(os.path.join(datapath, label), glob_file_str))
        for filepath in filepaths:
            audio = load_wav_to_torch(filepath, stft.sampling_rate)
            audio_norm = audio / MAX_WAV_VALUE
            audio_norm = audio_norm / torch.max(audio_norm.abs())
            audio_norm = audio_norm.unsqueeze(0)
            audio_norm = torch.autograd.Variable(audio_norm,
                                                 requires_grad=False)
            # Min-max scale each mel spectrogram to [-1, 1].
            mel_spec = stft.mel_spectrogram(audio_norm)[0]
            mel_spec -= mel_spec.min()
            mel_spec = mel_spec / torch.max(mel_spec)
            mel_spec = (mel_spec * 2) - 1
            # Split each utterance along the time axis into
            # train / valid / test portions.
            train_end = int(mel_spec.size(1) * data_split[0])
            val_end = int(mel_spec.size(1) * (data_split[0] + data_split[1]))
            data['train'].append([mel_spec[:, :train_end], label])
            data['valid'].append([mel_spec[:, train_end:val_end], label])
            data['test'].append([mel_spec[:, val_end:], label])
    return data
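# A minimal usage sketch for load_data, assuming a directory layout of
# datapath/<label>/*.wav where each subfolder name is the class label
# (the path here is hypothetical).
data = load_data('datasets/speakers', '*.wav')
train_mel, train_label = data['train'][0]
print(train_mel.shape, train_label)  # (64, frames_in_train_split), folder name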
class TextMelLoader(torch.utils.data.Dataset):
    """
    1) loads audio, text pairs
    2) normalizes text and converts them to sequences of one-hot vectors
    3) computes mel-spectrograms from audio files.
    """
    def __init__(self, audiopaths_and_text, hparams):
        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.load_mel_from_disk = hparams.load_mel_from_disk
        self.stft = TacotronSTFT(
            hparams.filter_length, hparams.hop_length, hparams.win_length,
            hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
            hparams.mel_fmax)
        random.seed(1234)
        random.shuffle(self.audiopaths_and_text)

    def get_mel_text_pair(self, audiopath_and_text):
        # separate filename and text
        audiopath, text = audiopath_and_text[0], audiopath_and_text[1]
        text = self.get_text(text)
        mel = self.get_mel(audiopath)
        return (text, mel)

    def get_mel(self, filename):
        if not self.load_mel_from_disk:
            audio, sampling_rate = load_wav_to_torch(filename)
            if sampling_rate != self.stft.sampling_rate:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, self.stft.sampling_rate))
            audio_norm = audio / self.max_wav_value
            audio_norm = audio_norm.unsqueeze(0)
            audio_norm = torch.autograd.Variable(audio_norm,
                                                 requires_grad=False)
            melspec = self.stft.mel_spectrogram(audio_norm)
            melspec = torch.squeeze(melspec, 0)
        else:
            melspec = torch.from_numpy(np.load(filename))
            assert melspec.size(0) == self.stft.n_mel_channels, (
                'Mel dimension mismatch: given {}, expected {}'.format(
                    melspec.size(0), self.stft.n_mel_channels))
        return melspec

    def get_text(self, text):
        text_norm = torch.IntTensor(text_to_sequence(text, self.text_cleaners))
        return text_norm

    def __getitem__(self, index):
        return self.get_mel_text_pair(self.audiopaths_and_text[index])

    def __len__(self):
        return len(self.audiopaths_and_text)
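# A minimal sketch of feeding TextMelLoader into a PyTorch DataLoader.
# 'filelists/train.txt' is a hypothetical "path|text" filelist; a real
# training setup would also pass a collate_fn that pads the variable-length
# text and mel tensors (e.g. Tacotron 2's TextMelCollate).
trainset = TextMelLoader('filelists/train.txt', create_hparams())
loader = torch.utils.data.DataLoader(trainset, batch_size=1, shuffle=False)
text, mel = trainset[0]
print(text.shape, mel.shape)  # (text_len,), (n_mel_channels, frames)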
def get_mel(filename, hparams):
    stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                        hparams.win_length, hparams.n_mel_channels,
                        hparams.sampling_rate, hparams.mel_fmin,
                        hparams.mel_fmax)
    audio = load_wav_to_torch(filename, hparams.sampling_rate)
    audio_norm = audio / hparams.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    melspec = stft.mel_spectrogram(audio_norm)
    melspec = torch.squeeze(melspec, 0)
    return melspec
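# A minimal usage sketch for get_mel; the wav path is hypothetical. Note
# that each call constructs a fresh TacotronSTFT, so for bulk feature
# extraction it is cheaper to hoist the STFT out of the loop.
hparams = create_hparams()
mel = get_mel('samples/utterance.wav', hparams)
print(mel.shape)  # (n_mel_channels, frames)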
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, training_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
        self.audio_files = files_to_list(training_files)
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin, mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        audio, sampling_rate = load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        # Take a random fixed-length segment; zero-pad clips shorter than
        # segment_length.
        if audio.size(0) >= self.segment_length:
            max_audio_start = audio.size(0) - self.segment_length
            audio_start = random.randint(0, max_audio_start)
            audio = audio[audio_start:audio_start + self.segment_length]
        else:
            audio = torch.nn.functional.pad(
                audio, (0, self.segment_length - audio.size(0)),
                'constant').data

        mel = self.get_mel(audio)
        audio = audio / MAX_WAV_VALUE
        return (mel, audio)

    def __len__(self):
        return len(self.audio_files)
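# A minimal sketch of wiring Mel2Samp into a training DataLoader, using
# WaveGlow-style defaults (22050 Hz audio, 16000-sample segments). The
# filelist path is hypothetical.
dataset = Mel2Samp('train_files.txt', segment_length=16000,
                   filter_length=1024, hop_length=256, win_length=1024,
                   sampling_rate=22050, mel_fmin=0.0, mel_fmax=8000.0)
loader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True)
mel, audio = dataset[0]
print(mel.shape, audio.shape)  # (n_mel_channels, frames), (16000,)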
def load_mel(path):
    hparams = create_hparams()
    stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                        hparams.win_length, hparams.n_mel_channels,
                        hparams.sampling_rate, hparams.mel_fmin,
                        hparams.mel_fmax)
    # librosa resamples to the requested sr on load, so the check below is
    # only a safety net.
    audio, sampling_rate = librosa.core.load(path, sr=hparams.sampling_rate)
    audio = torch.from_numpy(audio)
    if sampling_rate != hparams.sampling_rate:
        raise ValueError("{} SR doesn't match target {} SR".format(
            sampling_rate, stft.sampling_rate))
    # Note: librosa already returns floats in [-1, 1], so dividing by
    # max_wav_value again strongly attenuates the signal; this matches the
    # original code but may be unintended.
    audio_norm = audio / hparams.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    melspec = stft.mel_spectrogram(audio_norm)
    melspec = melspec.cpu()
    return melspec
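# torch.autograd.Variable has been a no-op since PyTorch 0.4; a sketch of
# the same normalization without gradient tracking, for newer code:
import torch

def normalize_audio(audio: torch.Tensor, max_wav_value: float = 32768.0):
    with torch.no_grad():
        return (audio / max_wav_value).unsqueeze(0)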
class Synthesizer(object):
    def __init__(self):
        super().__init__()
        self.hparams = create_hparams()
        self.hparams.sampling_rate = 16000
        self.hparams.max_decoder_steps = 600
        self.stft = TacotronSTFT(
            self.hparams.filter_length, self.hparams.hop_length,
            self.hparams.win_length, self.hparams.n_mel_channels,
            self.hparams.sampling_rate, self.hparams.mel_fmin,
            self.hparams.mel_fmax)

    def load_mel(self, path):
        audio, sampling_rate = load_wav_to_torch(path)
        if sampling_rate != self.hparams.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.stft.sampling_rate))
        audio_norm = audio / self.hparams.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = melspec.cuda()
        return melspec

    def load(self, checkpoint_path, waveglow_path):
        self.model = load_model(self.hparams)
        self.model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
        _ = self.model.eval()

        self.waveglow = torch.load(waveglow_path)['model']
        self.waveglow.cuda()

        path = './web/static/uploads/koemo_spk_emo_all_test.txt'
        with open(path, encoding='utf-8') as f:
            filepaths_and_text = [line.strip().split("|") for line in f]

        # Cache the per-utterance VAE-GST latents (and their emotion labels)
        # next to the checkpoint so they are only computed once.
        base_path = os.path.dirname(checkpoint_path)
        data_path = os.path.basename(checkpoint_path) + '_' + \
            path.rsplit('_', 1)[1].split('.')[0] + '.npz'
        npz_path = os.path.join(base_path, data_path)

        if os.path.exists(npz_path):
            d = np.load(npz_path)
            zs = d['zs']
            emotions = d['emotions']
        else:
            emotions = []
            zs = []
            for audio_path, _, _, emotion in tqdm(filepaths_and_text):
                melspec = self.load_mel(audio_path)
                _, _, _, z = self.model.vae_gst(melspec)
                zs.append(z.cpu().data)
                emotions.append(int(emotion))
            emotions = np.array(emotions)  # must be an ndarray, not a list
            zs = torch.cat(zs, dim=0).data.numpy()
            np.savez(npz_path, zs=zs, emotions=emotions)

        # Mean latent per emotion class: 0=neutral, 1=sad, 2=angry, 3=happy.
        self.neu = np.mean(zs[emotions == 0, :], axis=0)
        self.sad = np.mean(zs[emotions == 1, :], axis=0)
        self.ang = np.mean(zs[emotions == 2, :], axis=0)
        self.hap = np.mean(zs[emotions == 3, :], axis=0)

    def synthesize(self, text, path, condition_on_ref, ref_audio, ratios):
        print(ratios)
        sequence = np.array(
            text_to_sequence(text, ['korean_cleaners']))[None, :]
        sequence = torch.autograd.Variable(
            torch.from_numpy(sequence)).cuda().long()

        inputs = self.model.parse_input(sequence)
        transcript_embedded_inputs = \
            self.model.transcript_embedding(inputs).transpose(1, 2)
        transcript_outputs = \
            self.model.encoder.inference(transcript_embedded_inputs)
        print(condition_on_ref)

        if condition_on_ref:
            # Condition on the VAE-GST latent of a reference audio clip.
            ref_audio_mel = self.load_mel(ref_audio)
            latent_vector, _, _, _ = self.model.vae_gst(ref_audio_mel)
            latent_vector = latent_vector.unsqueeze(1).expand_as(
                transcript_outputs)
        else:
            # Condition on a weighted mixture of the mean emotion latents.
            latent_vector = ratios[0] * self.neu + ratios[1] * self.sad + \
                ratios[2] * self.hap + ratios[3] * self.ang
            latent_vector = torch.FloatTensor(latent_vector).cuda()
            latent_vector = self.model.vae_gst.fc3(latent_vector)
        encoder_outputs = transcript_outputs + latent_vector

        # Run the decoder step by step until the stop gate fires or the
        # step limit is reached.
        decoder_input = self.model.decoder.get_go_frame(encoder_outputs)
        self.model.decoder.initialize_decoder_states(encoder_outputs,
                                                     mask=None)
        mel_outputs, gate_outputs, alignments = [], [], []
        while True:
            decoder_input = self.model.decoder.prenet(decoder_input)
            mel_output, gate_output, alignment = \
                self.model.decoder.decode(decoder_input)
            mel_outputs += [mel_output]
            gate_outputs += [gate_output]
            alignments += [alignment]
            if torch.sigmoid(gate_output.data) > self.hparams.gate_threshold:
                break
            if len(mel_outputs) == self.hparams.max_decoder_steps:
                print("Warning! Reached max decoder steps")
                break
            decoder_input = mel_output

        mel_outputs, gate_outputs, alignments = \
            self.model.decoder.parse_decoder_outputs(
                mel_outputs, gate_outputs, alignments)
        mel_outputs_postnet = self.model.postnet(mel_outputs)
        mel_outputs_postnet = mel_outputs + mel_outputs_postnet

        # Note: WaveGlow is given the pre-postnet mels here;
        # mel_outputs_postnet is computed above but unused, which may be
        # unintended.
        with torch.no_grad():
            synth = self.waveglow.infer(mel_outputs, sigma=0.666)
        librosa.output.write_wav(path, synth[0].data.cpu().numpy(), 16000)
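# A minimal usage sketch for Synthesizer. The checkpoint paths and the
# Korean text are hypothetical; ratios are (neutral, sad, happy, angry)
# mixture weights for the mean emotion latents.
synth = Synthesizer()
synth.load('models/tacotron2_vae.pt', 'models/waveglow.pt')
synth.synthesize('안녕하세요', 'out.wav', condition_on_ref=False,
                 ref_audio=None, ratios=[0.0, 0.0, 1.0, 0.0])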
class LoadedMellotron:
    def __init__(self, ckpt, wglw, n_speakers=123):
        print("[Loading Model]")
        self.ckpt = ckpt
        self.hparams = create_hparams()
        self.hparams.n_speakers = n_speakers
        self.stft = TacotronSTFT(
            self.hparams.filter_length, self.hparams.hop_length,
            self.hparams.win_length, self.hparams.n_mel_channels,
            self.hparams.sampling_rate, self.hparams.mel_fmin,
            self.hparams.mel_fmax)
        self.mellotron = load_model(self.hparams).cuda().eval()
        self.waveglow = torch.load(wglw)['model'].cuda().eval()
        self.denoiser = Denoiser(self.waveglow).cuda().eval()
        self.arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
        self.mellotron.load_state_dict(torch.load(ckpt)['state_dict'])
        print('[Loaded Model]')

    def load_mel(self, path):
        audio, sampling_rate = librosa.core.load(
            path, sr=self.hparams.sampling_rate)
        audio = torch.from_numpy(audio)
        if sampling_rate != self.hparams.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.stft.sampling_rate))
        audio_norm = audio / self.hparams.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = melspec.cuda()
        return melspec

    def run(self, audio_path, text, title, speaker_id=0):
        print("[Running]")
        dataloader = TextMelLoader(audio_path, text, self.hparams, speaker_id)
        datacollate = TextMelCollate(1)
        text_encoded = torch.LongTensor(
            text_to_sequence(text, self.hparams.text_cleaners,
                             self.arpabet_dict))[None, :].cuda()
        pitch_contour = dataloader.get_data()[3][None].cuda()
        mel = self.load_mel(audio_path)
        print(audio_path, text)

        # Load source data to obtain rhythm, using Tacotron 2 as a forced
        # aligner.
        x, y = self.mellotron.parse_batch(datacollate([dataloader.get_data()]))

        with torch.no_grad():
            # Get rhythm (alignment map) using Tacotron 2.
            mel_outputs, mel_outputs_postnet, gate_outputs, rhythm = \
                self.mellotron.forward(x)
            rhythm = rhythm.permute(1, 0, 2)

        s_id = torch.LongTensor([speaker_id]).cuda()
        with torch.no_grad():
            mel_outputs, mel_outputs_postnet, gate_outputs, _ = \
                self.mellotron.inference_noattention(
                    (text_encoded, mel, s_id, pitch_contour, rhythm))
            audio = self.denoiser(
                self.waveglow.infer(mel_outputs_postnet, sigma=0.8),
                0.02)[:, 0]

        write(f"outputs/{title}", rate=self.hparams.sampling_rate,
              data=audio[0].data.cpu().numpy())
        print("[END]")
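# A minimal usage sketch for LoadedMellotron; the checkpoint paths, source
# clip, and transcript are hypothetical. The source clip supplies the pitch
# contour and rhythm that the output speaker imitates.
mellotron = LoadedMellotron('models/mellotron.pt', 'models/waveglow.pt')
mellotron.run('samples/source.wav', 'Hello world.',
              title='hello.wav', speaker_id=0)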
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, audio_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
        self.audio_files = files_to_list(audio_files)
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin, mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate

    def get_mel(self, audio):
        # audio = audio + (torch.rand_like(audio) - 0.5) / MAX_WAV_VALUE
        # (dither disabled: unclear why noise was being added)
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        # Keep drawing files until one passes the quality checks: correct
        # sample rate, long enough, not near-silent.
        while True:
            try:
                # Read audio
                filename = self.audio_files[index]
                audio, sampling_rate = load_wav_to_torch(filename)
                if sampling_rate != self.sampling_rate:
                    raise ValueError("{} SR doesn't match target {} SR".format(
                        sampling_rate, self.sampling_rate))

                if audio.size(0) < self.segment_length:
                    raise ValueError("Sample too short: {}".format(filename))

                # Take a random segment.
                max_audio_start = audio.size(0) - self.segment_length
                audio_start = random.randint(0, max_audio_start)
                audio = audio[audio_start:audio_start + self.segment_length]

                # A very small standard deviation usually means a bad sample.
                if audio.std() < 1e-4:
                    raise ValueError(
                        "Sample low std deviation: {}".format(filename))

                # Try to detect silence with pydub. AudioSegment slices are
                # in milliseconds, so convert the sample offsets first.
                start_ms = audio_start * 1000 // self.sampling_rate
                end_ms = (audio_start + self.segment_length) * 1000 \
                    // self.sampling_rate
                audio_pydub = AudioSegment.from_wav(filename)
                if silence.detect_silence(audio_pydub[start_ms:end_ms]):
                    raise ValueError("Sample too silent: {}".format(filename))
                break
            except Exception as e:
                print(e)
            finally:
                # On failure, retry with a random file instead of the
                # requested index (harmless on success, since we break).
                index = randrange(0, len(self.audio_files))

        mel = self.get_mel(audio)
        audio = audio / MAX_WAV_VALUE
        return (mel, audio)

    def __len__(self):
        return len(self.audio_files)
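# The extra imports this variant relies on, for reference. pydub's
# silence.detect_silence returns a list of [start_ms, end_ms] silent
# ranges, so an empty list (no silence found) is falsy in the check above.
from random import randrange
from pydub import AudioSegment, silence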