Example #1
def load_data(datapath, glob_file_str, scale=True, data_split=[0.8, 0.1]):
    data = defaultdict(list)
    stft = TacotronSTFT(filter_length=1024,
                        hop_length=160,
                        win_length=1024,
                        sampling_rate=16000,
                        n_mel_channels=64,
                        mel_fmin=0,
                        mel_fmax=None,
                        representation='asrgen')

    for folderpath in sorted(glob.glob(os.path.join(datapath, '*/'))):
        label = os.path.basename(os.path.normpath(folderpath))
        filepaths = glob.glob(
            os.path.join(datapath, label, glob_file_str))
        for filepath in filepaths:
            audio = load_wav_to_torch(filepath, stft.sampling_rate)
            audio_norm = audio / MAX_WAV_VALUE
            audio_norm = audio_norm / torch.max(audio_norm.abs())
            audio_norm = audio_norm.unsqueeze(0)
            audio_norm = torch.autograd.Variable(audio_norm,
                                                 requires_grad=False)
            mel_spec = stft.mel_spectrogram(audio_norm)[0]
            mel_spec -= mel_spec.min()
            mel_spec = mel_spec / torch.max(mel_spec)
            mel_spec = (mel_spec * 2) - 1
            train_end = int(mel_spec.size(1) * data_split[0])
            val_end = int(mel_spec.size(1) * (data_split[0] + data_split[1]))
            data['train'].append([mel_spec[:, :train_end], label])
            data['valid'].append([mel_spec[:, train_end:val_end], label])
            data['test'].append([mel_spec[:, val_end:], label])
    return data
Example #2
def mel_spectrogram_and_waveform_generation(checkpoint_path, text, hparams):

    # Griffin Lim iterations
    n_iter = 60
    # #### Load model from checkpoint
    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    _ = model.eval()

    # #### Prepare text input
    #text = "amor é fogo que arde sem se ver."
    sequence = np.array(text_to_sequence(text, ['basic_cleaners']))[None, :]
    sequence = torch.autograd.Variable(
        torch.from_numpy(sequence)).cuda().long()

    # #### Decode text input

    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)

    taco_stft = TacotronSTFT(hparams.filter_length,
                             hparams.hop_length,
                             hparams.win_length,
                             sampling_rate=hparams.sampling_rate)
    mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling

    waveform = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                           taco_stft.stft_fn, n_iter)
    return waveform
Example #3
    def __init__(self,
                 training_files,
                 val_files,
                 segment_length,
                 filter_length,
                 hop_length,
                 win_length,
                 sampling_rate,
                 mel_fmin,
                 mel_fmax,
                 val_flag=False):
        self.audio_files = files_to_list(training_files)
        if val_flag:
            self.audio_files = files_to_list(val_files)
        # Filter out clips that are too short, building a new list rather than
        # removing entries from the list while iterating over it.
        kept_files = []
        i = 0
        for file in self.audio_files:
            audio_data, sample_r = load_wav_to_torch(file)
            if audio_data.size(0) < segment_length:
                i += 1
            else:
                kept_files.append(file)
        self.audio_files = kept_files
        print("{} files shorter than segment_length".format(i))
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate
Example #4
def test_MCD_and_f0():
    hparams = create_hparams()
    stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                        hparams.win_length, hparams.n_mel_channels,
                        hparams.sampling_rate, hparams.mel_fmin,
                        hparams.mel_fmax)
    audio_path = 'kakao/1/1_0001.wav'
    mel_path = 'kakao/1/1_0001.mel.npy'
    srcMel = torch.from_numpy(np.load(mel_path)).unsqueeze(0)
    srcMel = torch.clamp(srcMel, -4.0, 4.0)
    # print(srcMel.shape,  srcMel.max(), srcMel.min())
    audio, sr = load_wav_to_torch(audio_path)
    # print(audio.shape, audio.max(), audio.min())
    audio_norm = audio / hparams.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)

    # print(audio_norm.shape, audio_norm.max(), audio_norm.min())
    dstMel = stft.mel_spectrogram(audio_norm)
    # print(dstMel.shape, dstMel.max(), dstMel.min())
    # mcc = stft.cepstrum_from_audio(audio_norm)
    # print('mcc', mcc.shape, mcc.max(), mcc.min())

    log_MCD = MCD_from_mels(stft, srcMel, dstMel)
    print(log_MCD.data, 'log')

    sqrtDiffF0 = sqDiffF0_from_mels(stft, srcMel, dstMel)
    print(sqrtDiffF0)
    meanSqrtDiffF0 = torch.mean(sqrtDiffF0)
    print(meanSqrtDiffF0.data, '100hz')
Example #5
def test(hparams,
         mel,
         output_path="test.wav",
         ref_level_db=20,
         magnitude_power=1.5):
    taco_stft = TacotronSTFT(hparams)
    stime = time.time()
    mel_decompress = mel_denormalize(mel).unsqueeze(0)
    mel_decompress = taco_stft.spectral_de_normalize(mel_decompress +
                                                     ref_level_db)**(
                                                         1 / magnitude_power)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling
    waveform = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :]),
                           taco_stft.stft_fn, 60)
    waveform = waveform[0].data.cpu().numpy()
    waveform = waveform / abs(waveform).max() * 0.99 * 2**15
    waveform = waveform.astype(dtype=np.int16)
    dec_time = time.time() - stime
    len_audio = float(len(waveform)) / float(hparams.sampling_rate)
    str = "audio length: {:.2f} sec,  mel_to_wave time: {:.2f}".format(
        len_audio, dec_time)
    print(str)
    write(os.path.join(output_path), hparams.sampling_rate, waveform)
Example #6
def inference_texts(model,
                    hp,
                    target_texts,
                    step,
                    model_name,
                    vocoder,
                    waveglow,
                    f_type='mel',
                    _type='train',
                    postnet=True):
    model.eval()
    for param in model.parameters():
        param.requires_grad = False
    sample_rate = 22050
    original_audio, texts = target_texts
    save_target = 'generate/{}-step-{}'.format(model_name, step)
    stft = TacotronSTFT(hp.filter_length, hp.hop_length, hp.win_length,
                        hp.n_mel_channels, hp.sampling_rate, hp.mel_fmin,
                        hp.mel_fmax)

    os.makedirs(save_target, exist_ok=True)
    for i, text in enumerate(texts):
        print(text)
        if original_audio:
            target_name = '{}-target-{}.wav'.format(_type, i)
            path = os.path.join(save_target, target_name)
            shutil.copy2(
                original_audio[i],
                path,
            )
        inputs = prepare_inputs(hp, text)
        if torch.cuda.device_count() > 1:
            with torch.no_grad():
                predict = model.module.inference(inputs, postnet=postnet)
        else:
            with torch.no_grad():
                predict = model.inference(inputs, postnet=postnet)
        name = '{}-{}-{}-{}.wav'.format(_type, f_type, i, vocoder)

        path = os.path.join(save_target, name)
        if vocoder == 'griffin_lim':
            mel_decompress = stft.spectral_de_normalize(predict)
            mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
            spec_from_mel_scaling = 1000
            spec_from_mel = torch.mm(mel_decompress[0], stft.mel_basis)
            spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
            spec_from_mel = spec_from_mel * spec_from_mel_scaling
            print(spec_from_mel.size())
            waveform = griffin_lim(
                torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                stft.stft_fn, 60)
            write(path, sample_rate, waveform[0].data.cpu().numpy())
        elif vocoder == 'waveglow' and waveglow:
            with torch.no_grad():
                audio = MAX_WAV_VALUE * waveglow.infer(predict, sigma=1.0)[0]
            audio = audio.cpu().numpy()
            audio = audio.astype('int16')
            write(path, sample_rate, audio)
Example #7
def inference(args):
    hparams = create_hparams()

    sentences = get_sentences(args)
    # sentences = [sentences[i: i+hparams.tacotron_synthesis_batch_size] for i in range(0, len(sentences), hparams.tacotron_synthesis_batch_size)]

    model = load_model(hparams)
    model.load_state_dict(torch.load(args.checkpoint_path)['state_dict'])
    model.cuda().eval()  #.half()

    test_set = TextMelLoaderEval(sentences, hparams)
    test_collate_fn = TextMelCollateEval(hparams)
    test_sampler = DistributedSampler(
        test_set) if hparams.distributed_run else None
    test_loader = DataLoader(test_set,
                             num_workers=0,
                             shuffle=False,
                             sampler=test_sampler,
                             batch_size=hparams.batch_size,
                             pin_memory=False,
                             drop_last=True,
                             collate_fn=test_collate_fn)

    taco_stft = TacotronSTFT(hparams.filter_length,
                             hparams.hop_length,
                             hparams.win_length,
                             sampling_rate=hparams.sampling_rate)

    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
                batch)

            for j in range(mel_outputs.size(0)):

                mel_decompress = taco_stft.spectral_de_normalize(
                    mel_outputs_postnet[j])
                mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
                spec_from_mel_scaling = 1000
                spec_from_mel = torch.mm(mel_decompress[0],
                                         taco_stft.mel_basis)
                spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
                spec_from_mel = spec_from_mel * spec_from_mel_scaling

                audio = griffin_lim(
                    torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                    taco_stft.stft_fn, args.griffin_iters)

                audio = audio.squeeze()
                audio = audio.cpu().numpy()
                #audio = audio.astype('int16')
                # audio_path = os.path.join('samples', "{}_synthesis.wav".format(args.out_filename))
                audio_path = os.path.join(
                    args.out_filename,
                    'batch_{}_sentence_{}.wav'.format(i, j))
                write(audio_path, hparams.sampling_rate, audio)
                print(audio_path)
Example #8
    def __init__(self):
        super().__init__()
        self.hparams = create_hparams()
        self.hparams.sampling_rate = 16000
        self.hparams.max_decoder_steps = 600

        self.stft = TacotronSTFT(
            self.hparams.filter_length, self.hparams.hop_length, self.hparams.win_length,
            self.hparams.n_mel_channels, self.hparams.sampling_rate, self.hparams.mel_fmin,
            self.hparams.mel_fmax)
Example #9
 def __init__(self, audio_files, segment_length, filter_length,
              hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
     self.audio_files = files_to_list(audio_files)
     random.seed(1234)
     random.shuffle(self.audio_files)
     self.stft = TacotronSTFT(filter_length=filter_length,
                              hop_length=hop_length,
                              win_length=win_length,
                              sampling_rate=sampling_rate,
                              mel_fmin=mel_fmin, mel_fmax=mel_fmax)
     self.segment_length = segment_length
     self.sampling_rate = sampling_rate
Example #10
def get_mel(filename, hparams):
    stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                        hparams.win_length, hparams.n_mel_channels,
                        hparams.sampling_rate, hparams.mel_fmin,
                        hparams.mel_fmax)
    audio = load_wav_to_torch(filename, hparams.sampling_rate)
    audio_norm = audio / hparams.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    melspec = stft.mel_spectrogram(audio_norm)
    melspec = torch.squeeze(melspec, 0)
    return melspec
Example #11
 def __init__(self, audiopaths_and_text, hparams):
     self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
     self.text_cleaners = hparams.text_cleaners
     self.max_wav_value = hparams.max_wav_value
     self.sampling_rate = hparams.sampling_rate
     self.load_mel_from_disk = hparams.load_mel_from_disk
     self.stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                              hparams.win_length, hparams.n_mel_channels,
                              hparams.sampling_rate, hparams.mel_fmin,
                              hparams.mel_fmax)
     random.seed(1234)
     random.shuffle(self.audiopaths_and_text)
Example #12
def synthesis_griffin_lim(mel, hparams):
    taco_stft = TacotronSTFT(hparams.filter_length,
                             hparams.hop_length,
                             hparams.win_length,
                             sampling_rate=hparams.sampling_rate)
    mel_decompress = taco_stft.spectral_de_normalize(mel)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling
    waveform = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                           taco_stft.stft_fn, 60)
    return waveform
Example #13
 def __init__(self, ckpt, wglw, n_speakers=123):
     print("[Loading Model]")
     self.ckpt = ckpt
     self.hparams = create_hparams()
     self.hparams.n_speakers = n_speakers
     self.stft = TacotronSTFT(self.hparams.filter_length,
                              self.hparams.hop_length,
                              self.hparams.win_length,
                              self.hparams.n_mel_channels,
                              self.hparams.sampling_rate,
                              self.hparams.mel_fmin, self.hparams.mel_fmax)
     self.mellotron = load_model(self.hparams).cuda().eval()
     self.waveglow = torch.load(wglw)['model'].cuda().eval()
     self.denoiser = Denoiser(self.waveglow).cuda().eval()
     self.arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
     self.mellotron.load_state_dict(torch.load(ckpt)['state_dict'])
     print('[Loaded Model]')
Example #14
def load_mel(path):
    hparams = create_hparams()
    stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                        hparams.win_length, hparams.n_mel_channels,
                        hparams.sampling_rate, hparams.mel_fmin,
                        hparams.mel_fmax)
    audio, sampling_rate = librosa.core.load(path, sr=hparams.sampling_rate)
    audio = torch.from_numpy(audio)
    if sampling_rate != hparams.sampling_rate:
        raise ValueError("{} SR doesn't match target {} SR".format(
            sampling_rate, stft.sampling_rate))
    audio_norm = audio / hparams.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    melspec = stft.mel_spectrogram(audio_norm)
    melspec = melspec.cpu()
    return melspec
Example #15
def main(text, checkpoint_path, path, name):
    #### Setup hparams
    hparams = create_hparams("distributed_run=False,mask_padding=False")
    hparams.filter_length = 1024
    hparams.hop_length = 256
    hparams.win_length = 1024

    #### Load model from checkpoint
    model = get_model(hparams, checkpoint_path)

    #### Prepare text input
    sequence = get_input(get_pinyin(text))

    #### inference
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
        sequence, drop_prob=0.25)

    #### tacotron result
    taco_stft = TacotronSTFT(hparams.filter_length,
                             hparams.hop_length,
                             hparams.win_length,
                             sampling_rate=hparams.sampling_rate)
    mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling
    waveform = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                           taco_stft.stft_fn, 60)
    write(
        os.path.join(path, name) + '_tacotron.wav', 16000,
        waveform[0].data.cpu().numpy())

    #### transform tacotron mel to wavenet mel
    wavenet_mel = to_wavenet_mel(mel_outputs_postnet.data.cpu().numpy()[0].T)

    #### save
    np.save(
        os.path.join(path, name) + '_mel.npy',
        mel_outputs_postnet.data.cpu().numpy()[0])
    np.save(
        os.path.join(path, name) + '_alig.npy',
        alignments.data.cpu().numpy()[0])
    np.save(os.path.join(path, name) + '.npy', wavenet_mel)
Example #16
class TextMelLoader(torch.utils.data.Dataset):
    """
        1) loads audio,text pairs
        2) normalizes text and converts them to sequences of one-hot vectors
        3) computes mel-spectrograms from audio files.
    """
    def __init__(self, audiopaths_and_text, hparams):
        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.load_mel_from_disk = hparams.load_mel_from_disk
        self.stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                                 hparams.win_length, hparams.n_mel_channels,
                                 hparams.sampling_rate, hparams.mel_fmin,
                                 hparams.mel_fmax)
        random.seed(1234)
        random.shuffle(self.audiopaths_and_text)

    def get_mel_text_pair(self, audiopath_and_text):
        # separate filename and text
        audiopath, text = audiopath_and_text[0], audiopath_and_text[1]
        text = self.get_text(text)
        mel = self.get_mel(audiopath)
        return (text, mel)

    def get_mel(self, filename):
        if not self.load_mel_from_disk:
            audio, sampling_rate = load_wav_to_torch(filename)
            if sampling_rate != self.stft.sampling_rate:
                raise ValueError("{} {} SR doesn't match target {} SR".format(
                    sampling_rate, self.stft.sampling_rate))
            audio_norm = audio / self.max_wav_value
            audio_norm = audio_norm.unsqueeze(0)
            audio_norm = torch.autograd.Variable(audio_norm,
                                                 requires_grad=False)
            melspec = self.stft.mel_spectrogram(audio_norm)
            melspec = torch.squeeze(melspec, 0)
        else:
            melspec = torch.from_numpy(np.load(filename))
            assert melspec.size(0) == self.stft.n_mel_channels, (
                'Mel dimension mismatch: given {}, expected {}'.format(
                    melspec.size(0), self.stft.n_mel_channels))

        return melspec

    def get_text(self, text):
        text_norm = torch.IntTensor(text_to_sequence(text, self.text_cleaners))
        return text_norm

    def __getitem__(self, index):
        return self.get_mel_text_pair(self.audiopaths_and_text[index])

    def __len__(self):
        return len(self.audiopaths_and_text)
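
The docstring above describes what TextMelLoader produces; as a minimal usage sketch (the filelist path is a placeholder, and create_hparams/TextMelCollate are assumed to be available, as in the other examples in this list), the dataset is typically wrapped in a DataLoader with TextMelCollate as the collate function:

# Sketch only: 'filelists/train_filelist.txt' is a placeholder, and the
# DataLoader settings mirror the usage shown in the later examples.
hparams = create_hparams()
train_set = TextMelLoader('filelists/train_filelist.txt', hparams)
collate_fn = TextMelCollate(hparams.n_frames_per_step)
train_loader = torch.utils.data.DataLoader(train_set,
                                           num_workers=1,
                                           shuffle=True,
                                           batch_size=hparams.batch_size,
                                           collate_fn=collate_fn)
text_ids, mel = train_set[0]  # IntTensor of symbol ids, (n_mel_channels, T) mel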
Example #17
def prepare_training_data(hparams, out_dir, for_wavenet, for_m2m, dataset):
    mel_dir = os.path.join(out_dir, 'mels')
    wav_dir = os.path.join(out_dir, 'audio')
    os.makedirs(out_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)

    metadatas = open(os.path.join(dataset, 'metadata.csv'),
                     'r',
                     encoding='utf-8').readlines()
    audio_paths = []
    sentences = []
    mels = []
    mus = []

    stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                        hparams.win_length, hparams.n_mel_channels,
                        hparams.sampling_rate, hparams.mel_fmin,
                        hparams.mel_fmax)

    for i, m in enumerate(metadatas):
        audio_path, sentence = m.strip().split('|')
        audio_path = os.path.join(dataset, 'wavs', audio_path)
        sentences.append(sentence)
        audio_paths.append(audio_path)

        audio = get_audio(audio_path)
        #print(audio.shape, audio.max(), audio.min())
        mel = get_mel(stft, audio)
        mels.append(mel)
        #print(mel.shape, mel.max(), mel.min(), mel.size(0))

        audio = audio.data.cpu().numpy()
        #print(len(audio), hparams.hop_length * mel.size(2))
        # Trim or pad the waveform so its length is exactly hop_length * n_frames
        diff = len(audio) - hparams.hop_length * mel.size(2)
        if diff > 0:
            audio = audio[:-diff]
        elif diff < 0:
            audio = np.append(audio, [0.] * -diff)

        #print(len(audio)%hparams.hop_length ==0, len(audio)//mel.size(2) == hparams.hop_length, len(audio), len(audio)//mel.size(2))

        mu = mulaw_quantize(audio)
        mus.append(mu)
        # print(mu.shape, mu.max(), mu.min())
        if (i % 100 == 0):
            print(i)

    if (for_wavenet):
        save_wavenet_map(out_dir, sentences, mels, mus)
    elif (for_m2m):
        save_m2m_metadata(out_dir, sentences, mels)

    pass
Example #18
 def generate(self, text=None):
     text = ch2p(text)
     sequence = np.array(text_to_sequence(text,
                                          ['basic_cleaners']))[None, :]
     sequence = torch.autograd.Variable(
         torch.from_numpy(sequence)).cuda().long()
     mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(
         sequence)
     taco_stft = TacotronSTFT(self.hparams.filter_length,
                              self.hparams.hop_length,
                              self.hparams.win_length,
                              sampling_rate=self.hparams.sampling_rate)
     mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
     mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
     spec_from_mel_scaling = 1000
     spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
     spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
     spec_from_mel = spec_from_mel * spec_from_mel_scaling
     waveform = griffin_lim(
         torch.autograd.Variable(spec_from_mel[:, :, :-1]),
         taco_stft.stft_fn, 60)
     return waveform
Example #19
def run(hparams, output_dir, checkpoint_path, sentence_path, speaker_id,
        trans_con, condition, logvar, cleaner, removing_silence_mel_padding,
        adding_silence_mel_padding, is_GL, is_melout, is_metaout):
    f = open(sentence_path, mode='r', encoding='utf-8-sig')
    sentences = [x.strip() for x in f.readlines()]
    print('All sentences to infer:', sentences)
    f.close()
    os.makedirs(output_dir, exist_ok=True)

    stft = TacotronSTFT(hparams)

    mels = generate_mels(
        hparams,
        checkpoint_path,
        sentences,
        speaker_id,
        trans_con,
        condition,
        logvar,
        cleaner,
        removing_silence_mel_padding,
        adding_silence_mel_padding,
        is_GL,
        output_dir,
    )
    if is_GL:
        mels_to_wavs_GL(hparams, mels, stft, output_dir)

    mel_paths = []
    if is_melout:
        mel_dir = os.path.join(output_dir, 'mels')
        os.makedirs(mel_dir, exist_ok=True)

        for i, mel in enumerate(mels):
            mel_path = os.path.join(output_dir, 'mels/',
                                    "mel-{}.npy".format(i))
            mel_paths.append(mel_path)
            if (list(mel.shape)[1] >=
                    hparams.max_decoder_steps - removing_silence_mel_padding):
                continue
            np.save(mel_path, mel)

    if is_metaout:
        with open(os.path.join(output_dir, 'metadata.csv'),
                  'w',
                  encoding='utf-8') as file:
            lines = []
            for i, s in enumerate(sentences):
                mel_path = mel_paths[i]
                if (list(mels[i].shape)[1] >= hparams.max_decoder_steps -
                        removing_silence_mel_padding):
                    continue
                lines.append('{}|{}\n'.format(mel_path, s))
            file.writelines(lines)
Example #20
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, training_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
        self.audio_files = files_to_list(training_files)
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        audio, sampling_rate = load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        # Take segment
        if audio.size(0) >= self.segment_length:
            max_audio_start = audio.size(0) - self.segment_length
            audio_start = random.randint(0, max_audio_start)
            audio = audio[audio_start:audio_start + self.segment_length]
        else:
            audio = torch.nn.functional.pad(
                audio, (0, self.segment_length - audio.size(0)),
                'constant').data

        mel = self.get_mel(audio)
        audio = audio / MAX_WAV_VALUE

        return (mel, audio)

    def __len__(self):
        return len(self.audio_files)
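
As the docstring notes, each item from Mel2Samp is a (mel, audio) pair; a minimal usage sketch follows (the filelist path and every hyperparameter value are illustrative assumptions, not taken from a particular config):

# Sketch only: all argument values below are assumed examples, chosen to
# resemble settings used elsewhere in these snippets.
dataset = Mel2Samp('filelists/wav_files.txt', segment_length=16000,
                   filter_length=1024, hop_length=256, win_length=1024,
                   sampling_rate=22050, mel_fmin=0.0, mel_fmax=8000.0)
mel, audio = dataset[0]  # mel: (n_mel_channels, frames), audio: (segment_length,)
loader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True)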
Example #21
def infer(checkpoint_path, griffin_iters, text, out_filename):
    hparams = create_hparams()
    hparams.sampling_rate = 22050

    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    _ = model.cuda().eval()  #.half()

    sequence = np.array(text_to_sequence(text, ['chinese_cleaners']))[None, :]
    sequence = torch.autograd.Variable(
        torch.from_numpy(sequence)).cuda().long()

    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)

    taco_stft = TacotronSTFT(hparams.filter_length,
                             hparams.hop_length,
                             hparams.win_length,
                             sampling_rate=hparams.sampling_rate)

    mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling

    audio = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                        taco_stft.stft_fn, griffin_iters)

    audio = audio.squeeze()
    audio = audio.cpu().numpy()
    #audio = audio.astype('int16')
    audio_path = os.path.join('samples',
                              "{}_synthesis.wav".format(out_filename))
    write(audio_path, hparams.sampling_rate, audio)
    print(audio_path)
    plot_alignment_to_numpy(
        alignments.squeeze().cpu().detach().numpy().T,
        os.path.join('samples', "{}_attention.png".format(out_filename)))
Example #22
class MelSpectrogramCreator():

    tacotron_stft = TacotronSTFT(hparams.fft_size, hparams.hop_size,
                                 hparams.win_length, hparams.num_mels,
                                 hparams.sample_rate, hparams.fmin,
                                 hparams.fmax)

    @classmethod
    def mel_spectrogram(cls, wav, method):
        if method == 'original':
            mel = audio.logmelspectrogram(wav)
        elif method == 'tacotron':
            wav_tensor = torch.Tensor(wav).unsqueeze(0)
            mel_tensor = cls.tacotron_stft.mel_spectrogram(wav_tensor)
            mel = mel_tensor.squeeze().data.numpy()
        else:
            raise ValueError
        return mel.astype(np.float32).T
Example #23
def generate_mels_by_ref_audio(model,
                               waveglow,
                               hparams,
                               sequence,
                               ref_wav,
                               denoiser,
                               denoiser_strength=0.01,
                               device=torch.device('cpu'),
                               *,
                               outpath='output.wav'):
    # Prepare ref audio input
    ref_audio_mel = load_mel(
        ref_wav,
        TacotronSTFT(hparams.filter_length, hparams.hop_length,
                     hparams.win_length, hparams.n_mel_channels,
                     hparams.sampling_rate, hparams.mel_fmin,
                     hparams.mel_fmax), hparams, device)

    # Decode text input
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference_by_ref_audio(
        sequence, ref_audio_mel)

    # Plot results
    # plot_data('mel.png', plot_spectrogram_to_numpy(mel_outputs.data.cpu().numpy()[0]))

    # Synthesize audio from spectrogram using WaveGlow
    with torch.no_grad():
        audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
    write(outpath, hparams.sampling_rate, audio[0].data.cpu().numpy())

    # (Optional) Remove WaveGlow bias
    if denoiser_strength > 0:
        audio_denoised = denoiser(audio, strength=denoiser_strength)[:, 0]
        audio_denoised = audio_denoised * hparams.max_wav_value
        write("denoised_output.wav", hparams.sampling_rate,
              audio_denoised.squeeze().cpu().numpy().astype('int16'))
Example #24
def parameter_experiment():

    tmc = [2, 4, 6, 8, 10, 12, 14, 16]  # time masking chunk
    fmc = [2, 4, 6, 8, 10, 12, 14, 16]  # frequency masking chunk
    # tmn = [(1,8), (2,4), (4,2), (8,1)] # time masking chunk number
    # fmn = [(1,6), (2,3), (3,2), (6,1)] # frequency masking chunk number
    twlr = [2, 4, 6, 8, 10, 12, 14, 16]  # time warping length ratio
    fwl = [2, 4, 6, 8, 10, 12, 14, 16]  # frequency warping length
    tlar = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]  # time length adjust ratio
    flar = [3, 3, 3, 3, 3, 3, 3, 3, 3, 3]  # frequency length adjust ratio
    var = [2, 4, 8, 16, 32, 64]
    lrtw = [0.4, 0.4, 0.4, 0.4, 0.4, 0.4]  # [0.2, 0.4, 0.8, 1]
    try_nums = np.arange(1, 2)
    hparams = create_hparams()
    stft = TacotronSTFT(hparams)

    # taking filelist about validation data
    with open('./filelists/meta_val.txt', encoding='utf-8-sig') as f:
        files = [x.strip().split('|')[0] for x in f.readlines()]

    # file to mel
    mels = []
    for x in files:
        mel = get_mel(stft, x, hparams, 0).squeeze(0)
        mels.append(mel)
        test(hparams, mel, "./test/test.wav")
        #plot_data(mel, 100)
    # average length of mel
    avg_len = np.average([mel.size(1) for mel in mels])
    print(avg_len)

    # griffin lim
    # os.makedirs('gl', exist_ok=True)
    # for i, mel in enumerate(mels):
    #     path = 'gl' + '/{}.wav'.format(i)
    #     test(hparams, mel, path)

    for try_num in try_nums:

        output_dir = 'try{}'.format(try_num)
        os.makedirs(output_dir, exist_ok=True)

        # making a directory for time warping length rate

        flar_path = output_dir + '/FLAR'
        lrtw_path = output_dir + '/LRTW'

        ## warping part

        # time warping length rate
        # ex = 0
        # for r in lrtw:
        #     dir = lrtw_path + '/{}'.format(r)
        #     os.makedirs(dir, exist_ok=True)
        #     for i, mel in enumerate(mels):
        #         path = dir + '/{}.wav'.format(i)
        #         mel_ = local_random_time_warping(mel, 0.4)
        #         plot_data(mel_, ex)
        #         ex += 1
        #         test(hparams, mel_, path)

        print("--------------------------------------------")
        # for r in twlr:
        #     dir = twlr_path + '/{}'.format(r)
        #     os.makedirs(dir, exist_ok=True)
        #     for i, mel in enumerate(mels):
        #         path = dir + '/{}.wav'.format(i)
        #         mel_ = time_warping(mel, r/100.0)
        #         test(hparams, mel_, path)
        #
        #
        # # frequency warping length
        # for l in fwl:
        #     dir = fwl_path + '/{}'.format(l)
        #     os.makedirs(dir, exist_ok=True)
        #     for i, mel in enumerate(mels):
        #         path = dir + '/{}.wav'.format(i)
        #         mel_ = freq_warping(mel,l)
        #         test(hparams, mel_, path)
        #
        # # time length adjustment rate
        # for r in tlar:
        #     dir = tlar_path + '/{}'.format(r)
        #     os.makedirs(dir, exist_ok=True)
        #     for i, mel in enumerate(mels):
        #         path = dir + '/{}.wav'.format(i)
        #         mel_ = local_random_time_warping(mel, warping_range(r/100.0))
        #         print(mel_.size())
        #         test(hparams, mel_, path)
        #
        ex = 0
        for r in flar:
            dir = flar_path  #+ '/{}'.format(r)
            os.makedirs(dir, exist_ok=True)
            for i, mel in enumerate(mels):
                path = dir + '/{}.wav'.format(ex)
                mel_ = local_random_freq_warping(mel, r)
                plot_data(mel_, ex)
                ex += 1
                test(hparams, mel_, path)
Example #25
import librosa
import torch
from torch.utils.data import DataLoader
from model import parse_batch
from configs.two_way_0730 import create_hparams
from train import initiate_model
from waveglow.denoiser import Denoiser
from layers import TacotronSTFT
from data_utils import TextMelLoader, TextMelCollate
from text import cmudict, text_to_sequence
from mellotron_utils import get_data_from_musicxml

hparams = create_hparams()
hparams.batch_size = 1
stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                    hparams.win_length, hparams.n_mel_channels,
                    hparams.sampling_rate, hparams.mel_fmin, hparams.mel_fmax)
speaker = "nes"
checkpoint_path = '/mnt/sdd1/backup_149/checkpoints/supervised/checkpoint_180000'
model = initiate_model(hparams).cuda().eval()
model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
waveglow_path = '/home/admin/projects/mellotron_init_with_single/models/waveglow_256channels_v4.pt'
waveglow = torch.load(waveglow_path)['model'].cuda().eval()
denoiser = Denoiser(waveglow).cuda().eval()
arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
test_text_path = 'filelists/emotion/neutral2.txt'
test_set = TextMelLoader(test_text_path, hparams)
datacollate = TextMelCollate(1)
dataloader = DataLoader(test_set,
                        num_workers=1,
                        shuffle=False,
Example #26
        id = os.path.splitext(id)[0]

        clean_char = custom_english_cleaners(text.rstrip())
        clean_phone = []
        for s in g2p(clean_char.lower()):
            if '@' + s in symbol_to_id:
                clean_phone.append('@' + s)
            else:
                clean_phone.append(s)

        metadata[id] = {'char': clean_char, 'phone': clean_phone}

stft = TacotronSTFT(filter_length=1024,
                    hop_length=256,
                    win_length=1024,
                    n_mel_channels=80,
                    sampling_rate=16000,
                    mel_fmin=55.0,
                    mel_fmax=7600.0)


def text2seq(text):
    sequence = [symbol_to_id['^']]
    sequence.extend([symbol_to_id[c] for c in text])
    sequence.append(symbol_to_id['~'])
    return sequence


def get_mel(filename):
    wav, sr = librosa.load(filename, sr=16000)
    wav, _ = librosa.effects.trim(wav,
Example #27
def measure(output_directory, log_directory, checkpoint_path, warm_start,
            n_gpus, rank, group_name, hparams):
    """Handles all the validation scoring and printing"""
    stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                        hparams.win_length, hparams.n_mel_channels,
                        hparams.sampling_rate, hparams.mel_fmin,
                        hparams.mel_fmax)

    mellotron = load_model(hparams).cuda().eval()
    mellotron.load_state_dict(torch.load(checkpoint_path)['state_dict'])

    waveglow_path = '/media/arsh/New Volume/Models/speech/waveglow_256channels_v4.pt'
    waveglow = torch.load(waveglow_path)['model'].cuda().eval()
    denoiser = Denoiser(waveglow).cuda().eval()

    arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
    audio_paths = 'filelists/libritts_train_clean_100_audiopath_text_sid_atleast5min_val_filelist.txt'
    dataloader = TextMelLoader(audio_paths, hparams)
    datacollate = TextMelCollate(1)

    speaker_ids = TextMelLoader(
        "filelists/libritts_train_clean_100_audiopath_text_sid_shorterthan10s_atleast5min_train_filelist.txt",
        hparams).speaker_ids
    speakers = pd.read_csv('filelists/libritts_speakerinfo.txt',
                           engine='python',
                           header=None,
                           comment=';',
                           sep=r' *\| *',
                           names=['ID', 'SEX', 'SUBSET', 'MINUTES', 'NAME'])
    speakers['MELLOTRON_ID'] = speakers['ID'].apply(
        lambda x: speaker_ids[x] if x in speaker_ids else -1)
    female_speakers = cycle(
        speakers.query("SEX == 'F' and MINUTES > 20 and MELLOTRON_ID >= 0")
        ['MELLOTRON_ID'].sample(frac=1).tolist())
    male_speakers = cycle(
        speakers.query("SEX == 'M' and MINUTES > 20 and MELLOTRON_ID >= 0")
        ['MELLOTRON_ID'].sample(frac=1).tolist())

    file_idx = 0
    MEL_DTW = []
    TPP_DTW = []
    RAND_DTW = []
    logSpecDbConst = 10.0 / math.log(10.0) * math.sqrt(2.0)
    while file_idx < len(dataloader):
        audio_path, text, sid = dataloader.audiopaths_and_text[file_idx]

        # get audio path, encoded text, pitch contour and mel for gst
        text_encoded = torch.LongTensor(
            text_to_sequence(text, hparams.text_cleaners,
                             arpabet_dict))[None, :].cuda()
        pitch_contour = dataloader[file_idx][3][None].cuda()
        mel = load_mel(audio_path, stft)
        fs, audio = read(audio_path)

        # load source data to obtain rhythm using tacotron 2 as a forced aligner
        x, y = mellotron.parse_batch(datacollate([dataloader[file_idx]]))

        with torch.no_grad():
            # get rhythm (alignment map) using tacotron 2
            mel_outputs, mel_outputs_postnet, gate_outputs, rhythm, gst, tpse_gst = mellotron.forward(
                x)
            rhythm = rhythm.permute(1, 0, 2)
        speaker_id = next(female_speakers) if np.random.randint(2) else next(
            male_speakers)
        speaker_id = torch.LongTensor([speaker_id]).cuda()

        with torch.no_grad():
            mel_outputs, mel_outputs_postnet, gate_outputs, _ = mellotron.inference_noattention(
                (text_encoded, mel, speaker_id, pitch_contour, rhythm),
                with_tpse=False)
        with torch.no_grad():
            audio_mel = denoiser(
                waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[:, 0]

        with torch.no_grad():
            mel_outputs, mel_outputs_postnet, gate_outputs, _ = mellotron.inference_noattention(
                (text_encoded, mel, speaker_id, pitch_contour, rhythm),
                with_tpse=True)
        with torch.no_grad():
            audio_tpp = denoiser(
                waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[:, 0]

        with torch.no_grad():
            mel_outputs, mel_outputs_postnet, gate_outputs, _ = mellotron.inference_noattention(
                (text_encoded, np.random.randint(
                    0, 9), speaker_id, pitch_contour, rhythm),
                with_tpse=False)
        with torch.no_grad():
            audio_rand = denoiser(
                waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[:, 0]
        audio = np.pad(audio, 128)

        MEL_DTW.append(
            logSpecDbConst *
            np.log(dtw(audio_mel.data.cpu().numpy(), audio, eucCepDist)[0]))
        TPP_DTW.append(
            logSpecDbConst *
            np.log(dtw(audio_tpp.data.cpu().numpy(), audio, eucCepDist)[0]))
        RAND_DTW.append(
            logSpecDbConst *
            np.log(dtw(audio_rand.data.cpu().numpy(), audio, eucCepDist)[0]))
        print(MEL_DTW[-1], TPP_DTW[-1], RAND_DTW[-1])
        print("MEL DTW, Mean: ", np.mean(MEL_DTW), " SD: ", np.std(MEL_DTW))
        print("TPP DTW, Mean: ", np.mean(TPP_DTW), " SD: ", np.std(TPP_DTW))
        print("RAND DTW, Mean: ", np.mean(RAND_DTW), " SD: ", np.std(RAND_DTW))
        file_idx += 1
Example #28
def infer(output_directory, checkpoint_path, warm_start, hparams, debug=False):
    """Inference with teaching force

    Params
    ------
    output_directory (string): directory to the spectrograms
    checkpoint_path(string): checkpoint path
    hparams (object): comma separated list of "name=value" pairs.
    """

    os.makedirs(output_directory, exist_ok=True)
    taco_stft = TacotronSTFT(hparams.filter_length,
                             hparams.hop_length,
                             hparams.win_length,
                             sampling_rate=hparams.sampling_rate)

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    model = load_model(hparams)
    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    return_file_name = True

    trainset = TextMelLoader(hparams.training_files,
                             hparams,
                             return_file_name=return_file_name)
    collate_fn = TextMelCollate(hparams.n_frames_per_step,
                                return_file_name=return_file_name)

    train_sampler = None

    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=False,
                              sampler=train_sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False,
                              collate_fn=collate_fn)

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(checkpoint_path, model,
                                     hparams.ignore_layers)
        else:
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    model.eval()

    for i, batch in enumerate(train_loader):
        x, y = model.parse_batch(batch[:][:-1])
        files_name = batch[:][-1]
        mel_outputs, mel_outputs_postnet, _, alignments = model(x)

        _, _, mel_expected_padded, _, mel_lengths = x

        for idx in range(mel_outputs_postnet.size(0)):

            name = os.path.basename(files_name[idx]).replace(".wav", '')
            mel_padded = mel_outputs_postnet[idx]
            mel_length = mel_lengths[idx]
            mel = mel_padded[:, :mel_length]
            np.save(os.path.join(output_directory, name + '.npy'),
                    mel.detach().cpu().numpy())

            if debug:
                print(
                    "Debug Mode ON: Saving Wave files and Spectrograms Plot in:",
                    output_directory)
                # plot audios
                librosa.output.write_wav(
                    os.path.join(output_directory, name + '.wav'),
                    spec_to_waveform(taco_stft, mel).detach().cpu().numpy(),
                    sr=hparams.sampling_rate)
                librosa.output.write_wav(
                    os.path.join(output_directory, name + '_padded.wav'),
                    spec_to_waveform(taco_stft,
                                     mel_padded).detach().cpu().numpy(),
                    sr=hparams.sampling_rate)
                librosa.output.write_wav(
                    os.path.join(output_directory,
                                 name + '_expected_padded.wav'),
                    spec_to_waveform(
                        taco_stft,
                        mel_expected_padded[idx]).detach().cpu().numpy(),
                    sr=hparams.sampling_rate)
                # plot figures
                plot_spectrogram(
                    mel.detach().cpu().numpy(),
                    os.path.join(output_directory, name + '.png'))
                plot_spectrogram(
                    mel_padded.detach().cpu().numpy(),
                    os.path.join(output_directory, name + '_padded.png'))
                plot_spectrogram(
                    mel_expected_padded[idx].detach().cpu().numpy(),
                    os.path.join(output_directory,
                                 name + '_expect_padded.png'))
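
A hedged call sketch for the parameters listed in the docstring (both paths are placeholders):

# Sketch only: the output directory and checkpoint path are placeholders.
hparams = create_hparams()
infer('output/teacher_forced_mels', 'checkpoints/checkpoint_100000',
      warm_start=False, hparams=hparams, debug=True)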
Example #29
def taco_stft():
    n_fft, hop_length, win_length = _stft_parameters()
    stft = TacotronSTFT(n_fft, hop_length, win_length)
    return stft
Example #30
class Synthesizer(object):
    def __init__(self):
        super().__init__()
        self.hparams = create_hparams()
        self.hparams.sampling_rate = 16000
        self.hparams.max_decoder_steps = 600

        self.stft = TacotronSTFT(
            self.hparams.filter_length, self.hparams.hop_length, self.hparams.win_length,
            self.hparams.n_mel_channels, self.hparams.sampling_rate, self.hparams.mel_fmin,
            self.hparams.mel_fmax)

    def load_mel(self, path):
        audio, sampling_rate = load_wav_to_torch(path)
        if sampling_rate != self.hparams.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.stft.sampling_rate))
        audio_norm = audio / self.hparams.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = melspec.cuda()
        return melspec

    # def close(self):
    #     tf.reset_default_graph()
    #     self.sess.close()

    def load(self, checkpoint_path, waveglow_path):
        self.model = load_model(self.hparams)
        self.model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
        _ = self.model.eval()

        self.waveglow = torch.load(waveglow_path)['model']
        self.waveglow.cuda()

        path = './web/static/uploads/koemo_spk_emo_all_test.txt'
        with open(path, encoding='utf-8') as f:
            filepaths_and_text = [line.strip().split("|") for line in f]
        
        base_path = os.path.dirname(checkpoint_path)
        data_path = os.path.basename(checkpoint_path) + '_' + path.rsplit('_', 1)[1].split('.')[0] + '.npz'
        npz_path = os.path.join(base_path, data_path)
        
        if os.path.exists(npz_path):
            d = np.load(npz_path)
            zs = d['zs']
            emotions = d['emotions']
        else:
            emotions = []
            zs = []
            for audio_path, _, _, emotion in tqdm(filepaths_and_text):
                melspec = self.load_mel(audio_path)
                _, _, _, z = self.model.vae_gst(melspec)
                zs.append(z.cpu().data)
                emotions.append(int(emotion))
            emotions = np.array(emotions)  # must be an ndarray, not a list
            zs = torch.cat(zs, dim=0).data.numpy()
            d = {'zs':zs, 'emotions':emotions}
            np.savez(npz_path, **d)

        self.neu = np.mean(zs[emotions==0,:], axis=0)
        self.sad = np.mean(zs[emotions==1,:], axis=0)
        self.ang = np.mean(zs[emotions==2,:], axis=0)
        self.hap = np.mean(zs[emotions==3,:], axis=0)

    def synthesize(self, text, path, condition_on_ref, ref_audio, ratios):
        print(ratios)
        sequence = np.array(text_to_sequence(text, ['korean_cleaners']))[None, :]
        sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()
        inputs = self.model.parse_input(sequence)
        transcript_embedded_inputs = self.model.transcript_embedding(inputs).transpose(1,2)
        transcript_outputs = self.model.encoder.inference(transcript_embedded_inputs)
        print(condition_on_ref)

        if condition_on_ref:
            #ref_audio = '/data1/jinhan/KoreanEmotionSpeech/wav/hap/hap_00000001.wav'
            ref_audio_mel = self.load_mel(ref_audio)
            latent_vector, _, _, _ = self.model.vae_gst(ref_audio_mel)
            latent_vector = latent_vector.unsqueeze(1).expand_as(transcript_outputs)
        
        else: # condition on emotion ratio
            latent_vector = ratios[0] * self.neu + ratios[1] * self.sad + \
                        ratios[2] * self.hap + ratios[3] * self.ang
            latent_vector = torch.FloatTensor(latent_vector).cuda()
            latent_vector = self.model.vae_gst.fc3(latent_vector)

        encoder_outputs = transcript_outputs + latent_vector

        decoder_input = self.model.decoder.get_go_frame(encoder_outputs)
        self.model.decoder.initialize_decoder_states(encoder_outputs, mask=None)
        mel_outputs, gate_outputs, alignments = [], [], []

        while True:
            decoder_input = self.model.decoder.prenet(decoder_input)
            mel_output, gate_output, alignment = self.model.decoder.decode(decoder_input)

            mel_outputs += [mel_output]
            gate_outputs += [gate_output]
            alignments += [alignment]

            if torch.sigmoid(gate_output.data) > self.hparams.gate_threshold:
                # print(torch.sigmoid(gate_output.data), gate_output.data)
                break
            if len(mel_outputs) == self.hparams.max_decoder_steps:
                print("Warning! Reached max decoder steps")
                break

            decoder_input = mel_output

        mel_outputs, gate_outputs, alignments = self.model.decoder.parse_decoder_outputs(
                mel_outputs, gate_outputs, alignments)
        mel_outputs_postnet = self.model.postnet(mel_outputs)
        mel_outputs_postnet = mel_outputs + mel_outputs_postnet
        # print(mel_outputs_postnet.shape)

        with torch.no_grad():
            synth = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)
        
        # return synth[0].data.cpu().numpy()
        # path = add_postfix(path, idx)
        # print(path)
        librosa.output.write_wav(path, synth[0].data.cpu().numpy(), 16000)