# 示例#1 (Example 1)
def synthesis(m, s, CONFIG, use_cuda, ap, language=None):
    """Synthesize a waveform for text `s` with Tacotron model `m` + WaveRNN.

    Args:
        m: Tacotron-style model; `forward` returns
            (mel_spec, linear_spec, alignments, stop_tokens).
        s (str): input text to synthesize.
        CONFIG: config object providing `text_cleaner`, `use_phonemes`
            and `phoneme_language`.
        use_cuda (bool): move model inputs (and the mel tensor) to GPU.
        ap: audio processor (unused here; kept for interface compatibility).
        language (str, optional): phoneme language; defaults to
            CONFIG.phoneme_language.

    Returns:
        Waveform produced by the module-level `wavernn` vocoder.
    """
    if language is None:
        language = CONFIG.phoneme_language
    text_cleaner = [CONFIG.text_cleaner]
    if CONFIG.use_phonemes:
        seq = np.asarray(phoneme_to_sequence(s, text_cleaner, language),
                         dtype=np.int32)
    else:
        seq = np.asarray(text_to_sequence(s, text_cleaner), dtype=np.int32)
    chars_var = torch.from_numpy(seq).unsqueeze(0)
    if use_cuda:
        chars_var = chars_var.cuda()
    mel_spec, linear_spec, alignments, stop_tokens = m.forward(
        chars_var.long())
    mel_spec = mel_spec[0].data.cpu().numpy()
    mel_tensor = torch.FloatTensor(mel_spec.T).unsqueeze(0)
    # Honor the caller's `use_cuda` flag instead of torch.cuda.is_available():
    # the two can disagree (CUDA present but use_cuda=False), which previously
    # placed the mel tensor on a device the caller did not ask for.
    if use_cuda:
        mel_tensor = mel_tensor.cuda()
    wav = wavernn.generate(mel_tensor, batched=True, target=11000, overlap=550)
    return wav
# 示例#2 (Example 2)
def create_speech(m, s, CONFIG, use_cuda, ap):
    """Run Tacotron inference on text `s` and return audio artefacts.

    Args:
        m: model whose `forward` returns
            (mel_out, linear_out, alignments, stop_tokens).
        s (str): input text.
        CONFIG: config object providing `text_cleaner`.
        use_cuda (bool): run inference on GPU.
        ap: audio processor used for denormalization / inversion / saving.

    Returns:
        tuple: (wav ndarray, alignment ndarray, denormalized spec,
        stop_tokens tensor).
    """
    text_cleaner = [CONFIG.text_cleaner]
    seq = np.array(text_to_sequence(s, text_cleaner))
    chars_var = torch.from_numpy(seq).unsqueeze(0)
    if use_cuda:
        chars_var = chars_var.cuda()
    # Inference only: torch.no_grad() replaces the Variable(..., volatile=True)
    # API, which was deprecated in PyTorch 0.4 and later removed (the
    # `volatile` keyword now raises on modern torch).
    with torch.no_grad():
        mel_out, linear_out, alignments, stop_tokens = m.forward(chars_var)
    linear_out = linear_out[0].data.cpu().numpy()
    alignment = alignments[0].cpu().data.numpy()
    spec = ap._denormalize(linear_out)
    wav = ap.inv_spectrogram(linear_out.T)
    wav = wav[:ap.find_endpoint(wav)]
    out = io.BytesIO()
    ap.save_wav(wav, out)
    return wav, alignment, spec, stop_tokens
# 示例#3 (Example 3)
 def __getitem__(self, idx):
     """Return one dataset sample: encoded text, waveform and item id."""
     frame = self.frames[idx]
     wav_name = os.path.join(self.root_dir, frame[0]) + '.wav'
     encoded = np.asarray(text_to_sequence(frame[1], [self.cleaners]),
                          dtype=np.int32)
     audio = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32)
     return {'text': encoded, 'wav': audio, 'item_idx': frame[0]}
# 示例#4 (Example 4)
def synthesis(model, ap, text, use_cuda, text_cleaner):
    """Synthesize `text` with `model` and invert the linear spectrogram.

    Args:
        model: model whose `forward` returns
            (mel, linear, alignments, stop_tokens).
        ap: audio processor providing `inv_spectrogram`.
        text (str): input text.
        use_cuda (bool): run inference on GPU.
        text_cleaner: cleaner name passed to `text_to_sequence`.

    Returns:
        tuple: (wav ndarray, linear spectrogram ndarray, alignments tensor).
    """
    text_cleaner = [text_cleaner]
    seq = np.array(text_to_sequence(text, text_cleaner))
    # Embedding layers need int64 indices; cast on both CPU and GPU paths
    # (the original called .long() only on the CUDA branch, so the CPU path
    # fed int32 indices to the embedding).
    chars_var = torch.from_numpy(seq).unsqueeze(0).long()
    if use_cuda:
        chars_var = chars_var.cuda()
    _, linear_out, alignments, _ = model.forward(chars_var)
    linear_out = linear_out[0].data.cpu().numpy()
    wav = ap.inv_spectrogram(linear_out.T)
    return wav, linear_out, alignments
# 示例#5 (Example 5)
def create_speech(m, s, CONFIG, use_cuda, ap):
    """Synthesize text `s` with model `m`; return wav, alignment, spec, stop tokens."""
    cleaners = [CONFIG.text_cleaner]
    sequence = np.array(text_to_sequence(s, cleaners))
    inputs = torch.from_numpy(sequence).unsqueeze(0)
    if use_cuda:
        inputs = inputs.cuda()
    mel_out, linear_out, alignments, stop_tokens = m.forward(inputs)
    linear_np = linear_out[0].data.cpu().numpy()
    align_np = alignments[0].cpu().data.numpy()
    denorm_spec = ap._denormalize(linear_np)
    audio = ap.inv_spectrogram(linear_np.T)
    audio = audio[:ap.find_endpoint(audio)]
    # Write a wav copy into an in-memory buffer (side effect kept from the
    # original implementation; the buffer itself is not returned).
    buf = io.BytesIO()
    ap.save_wav(audio, buf)
    return audio, align_np, denorm_spec, stop_tokens
def tts(text,
        model_path='model/best_model.pth.tar',
        config_path='model/config.json',
        use_cuda=False):
    """Load a Tacotron checkpoint, synthesize `text`, and save it as a wav.

    Args:
        text (str): input text; also used to derive the output file name.
        model_path (str): path to the checkpoint file.
        config_path (str): path to the model config JSON.
        use_cuda (bool): load and run the model on GPU.

    Returns:
        str: file name of the wav written under static/samples/.
    """
    CONFIG = load_config(config_path)
    model = Tacotron(CONFIG.embedding_size, CONFIG.num_freq, CONFIG.num_mels,
                     CONFIG.r)
    # Load the checkpoint from model_path on both branches; the CUDA branch
    # previously appended an undefined global (seq_to_seq_test_model_fname)
    # to the path, raising NameError whenever use_cuda was True.
    if use_cuda:
        cp = torch.load(model_path, map_location='cuda:0')
    else:
        cp = torch.load(model_path, map_location=lambda storage, loc: storage)
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()
    model.eval()
    # Cap decoding length to keep inference bounded.
    model.decoder.max_decoder_steps = 250
    ap = AudioProcessor(CONFIG.sample_rate,
                        CONFIG.num_mels,
                        CONFIG.min_level_db,
                        CONFIG.frame_shift_ms,
                        CONFIG.frame_length_ms,
                        CONFIG.ref_level_db,
                        CONFIG.num_freq,
                        CONFIG.power,
                        CONFIG.preemphasis,
                        griffin_lim_iters=50)
    t_1 = time.time()
    text_cleaner = [CONFIG.text_cleaner]
    seq = np.array(text_to_sequence(text, text_cleaner))
    chars_var = torch.from_numpy(seq).unsqueeze(0)
    if use_cuda:
        chars_var = chars_var.cuda()
    # forward() returns (mel_out, linear_out, alignments, stop_tokens); the
    # linear spectrogram is what inv_spectrogram expects. The original took
    # element [0] of the tuple and therefore inverted the mel spectrogram.
    _, linear_out, _, _ = model.forward(chars_var.long())
    linear_out = linear_out[0].data.cpu().numpy()
    waveform = ap.inv_spectrogram(linear_out.T)
    waveform = waveform[:ap.find_endpoint(waveform)]
    out_path = 'static/samples/'
    os.makedirs(out_path, exist_ok=True)
    file_name = text.replace(" ", "_").replace(".", "") + ".wav"
    out_path = os.path.join(out_path, file_name)
    ap.save_wav(waveform, out_path)
    # print(" >  Run-time: {}".format(time.time() - t_1))

    return file_name
# 示例#7 (Example 7)
    def load_data(self, idx):
        """Build the training sample dict (text ids, audio, id, speaker) for item `idx`."""
        raw_text, wav_file, speaker_name = self.items[idx]
        audio = np.asarray(self.load_wav(wav_file), dtype=np.float32)

        if self.use_phonemes:
            encoded = self._load_or_generate_phoneme_sequence(wav_file,
                                                              raw_text)
        else:
            encoded = np.asarray(text_to_sequence(raw_text, [self.cleaners]),
                                 dtype=np.int32)
        # Fail loudly on empty items, reporting the offending wav path.
        assert encoded.size > 0, self.items[idx][1]
        assert audio.size > 0, self.items[idx][1]

        return {
            'text': encoded,
            'wav': audio,
            'item_idx': self.items[idx][1],
            'speaker_name': speaker_name,
        }
# 示例#8 (Example 8)
 def tts(self, text):
     """Synthesize `text` sentence by sentence into one wav buffer.

     Fixes vs. the original:
       * each sentence is encoded from `sen` — the original passed the whole
         `text` to text_to_sequence on every loop iteration, synthesizing the
         full input once per sentence;
       * the accumulated `wavs` segments are concatenated and written once at
         the end — the original wrote only the last sentence's audio to a
         buffer created inside the loop.

     Returns:
         io.BytesIO: buffer holding the saved wav data.
     """
     text_cleaner = [self.config.text_cleaner]
     wavs = []
     for sen in text.split('.'):
         if len(sen) < 3:
             continue
         sen += '.'
         print(sen)
         sen = sen.strip()
         seq = np.array(text_to_sequence(sen, text_cleaner))
         chars_var = torch.from_numpy(seq).unsqueeze(0)
         if self.use_cuda:
             chars_var = chars_var.cuda()
         mel_out, linear_out, alignments, stop_tokens = self.model.forward(
             chars_var)
         linear_out = linear_out[0].data.cpu().numpy()
         wav = self.ap.inv_spectrogram(linear_out.T)
         # wav = wav[:self.ap.find_endpoint(wav)]
         wavs.append(wav)
         wavs.append(np.zeros(10000))  # short silence between sentences
     out = io.BytesIO()
     self.save_wav(np.concatenate(wavs), out)
     return out
def synthesis(m, s, CONFIG, use_cuda, ap, language=None):
    """Synthesize text `s` with model `m`, inverting the linear spectrogram via `ap`.

    Falls back to CONFIG.phoneme_language when `language` is not given;
    encodes with phonemes or plain characters per CONFIG.use_phonemes.
    Returns the trimmed waveform ndarray.
    """
    lang = CONFIG.phoneme_language if language is None else language
    cleaners = [CONFIG.text_cleaner]
    if CONFIG.use_phonemes:
        sequence = np.asarray(phoneme_to_sequence(s, cleaners, lang),
                              dtype=np.int32)
    else:
        sequence = np.asarray(text_to_sequence(s, cleaners), dtype=np.int32)
    inputs = torch.from_numpy(sequence).unsqueeze(0)
    if use_cuda:
        inputs = inputs.cuda()
    mel_spec, linear_spec, alignments, stop_tokens = m.forward(inputs.long())
    linear_np = linear_spec[0].data.cpu().numpy()
    mel_np = mel_spec[0].data.cpu().numpy()
    align_np = alignments[0].cpu().data.numpy()
    wav = ap.inv_spectrogram(linear_np.T)
    return wav[:ap.find_endpoint(wav)]