Example #1
    def say(self, text, output):
        # build the model
        model = Tacotron(self.CONFIG.embedding_size, self.CONFIG.num_freq,
                         self.CONFIG.num_mels, self.CONFIG.r)

        # load the audio processor
        ap = AudioProcessor(self.CONFIG.sample_rate, self.CONFIG.num_mels,
                            self.CONFIG.min_level_db,
                            self.CONFIG.frame_shift_ms,
                            self.CONFIG.frame_length_ms,
                            self.CONFIG.ref_level_db, self.CONFIG.num_freq,
                            self.CONFIG.power, self.CONFIG.preemphasis, 60)

        # load model state
        if self.use_cuda:
            cp = torch.load(self.MODEL_PATH)
        else:
            cp = torch.load(self.MODEL_PATH,
                            map_location=lambda storage, loc: storage)

        # load the model
        model.load_state_dict(cp['model'])
        if self.use_cuda:
            model.cuda()
        model.eval()

        model.decoder.max_decoder_steps = 400
        wavs = self.text2audio(text, model, self.CONFIG, self.use_cuda, ap)

        audio = np.concatenate(wavs)
        ap.save_wav(audio, output)

        return
Example #2
def tts(text,
        model_path='model/best_model.pth.tar',
        config_path='model/config.json',
        use_cuda=False):
    CONFIG = load_config(config_path)
    model = Tacotron(CONFIG.embedding_size, CONFIG.num_freq, CONFIG.num_mels,
                     CONFIG.r)
    if use_cuda:
        cp = torch.load(model_path, map_location='cuda:0')
    else:
        cp = torch.load(model_path, map_location=lambda storage, loc: storage)
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()
    model.eval()
    model.decoder.max_decoder_steps = 250
    ap = AudioProcessor(CONFIG.sample_rate,
                        CONFIG.num_mels,
                        CONFIG.min_level_db,
                        CONFIG.frame_shift_ms,
                        CONFIG.frame_length_ms,
                        CONFIG.ref_level_db,
                        CONFIG.num_freq,
                        CONFIG.power,
                        CONFIG.preemphasis,
                        griffin_lim_iters=50)
    t_1 = time.time()
    text_cleaner = [CONFIG.text_cleaner]
    seq = np.array(text_to_sequence(text, text_cleaner))
    chars_var = torch.from_numpy(seq).unsqueeze(0)
    if use_cuda:
        chars_var = chars_var.cuda()
    linear_out = model.forward(chars_var.long())
    linear_out = linear_out[0].data.cpu().numpy()
    waveform = ap.inv_spectrogram(linear_out.T)
    waveform = waveform[:ap.find_endpoint(waveform)]
    out_path = 'static/samples/'
    os.makedirs(out_path, exist_ok=True)
    file_name = text.replace(" ", "_").replace(".", "") + ".wav"
    out_path = os.path.join(out_path, file_name)
    ap.save_wav(waveform, out_path)
    # print(" >  Run-time: {}".format(time.time() - t_1))

    return file_name
Example #3
class TestTTSDataset(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.max_loader_iter = 4
        self.ap = AudioProcessor(**c.audio)

    def _create_dataloader(self, batch_size, r, bgs):
        items = ljspeech(c.data_path, "metadata.csv")
        dataset = TTSDataset.MyDataset(
            r,
            c.text_cleaner,
            compute_linear_spec=True,
            ap=self.ap,
            meta_data=items,
            tp=c.characters,
            batch_group_size=bgs,
            min_seq_len=c.min_seq_len,
            max_seq_len=float("inf"),
            use_phonemes=False,
        )
        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=False,
            collate_fn=dataset.collate_fn,
            drop_last=True,
            num_workers=c.num_loader_workers,
        )
        return dataloader, dataset

    def test_loader(self):
        if ok_ljspeech:
            dataloader, dataset = self._create_dataloader(2, c.r, 0)

            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                text_input = data[0]
                text_lengths = data[1]
                speaker_name = data[2]
                linear_input = data[3]
                mel_input = data[4]
                mel_lengths = data[5]
                stop_target = data[6]
                item_idx = data[7]

                neg_values = text_input[text_input < 0]
                check_count = len(neg_values)
                assert check_count == 0, " !! Negative values in text_input: {}".format(
                    check_count)
                # TODO: more assertion here
                assert isinstance(speaker_name[0], str)
                assert linear_input.shape[0] == c.batch_size
                assert linear_input.shape[2] == self.ap.fft_size // 2 + 1
                assert mel_input.shape[0] == c.batch_size
                assert mel_input.shape[2] == c.audio["num_mels"]
                # check normalization ranges
                if self.ap.symmetric_norm:
                    assert mel_input.max() <= self.ap.max_norm
                    assert mel_input.min() >= -self.ap.max_norm  # pylint: disable=invalid-unary-operand-type
                    assert mel_input.min() < 0
                else:
                    assert mel_input.max() <= self.ap.max_norm
                    assert mel_input.min() >= 0

    def test_batch_group_shuffle(self):
        if ok_ljspeech:
            dataloader, dataset = self._create_dataloader(2, c.r, 16)
            last_length = 0
            frames = dataset.items
            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                text_input = data[0]
                text_lengths = data[1]
                speaker_name = data[2]
                linear_input = data[3]
                mel_input = data[4]
                mel_lengths = data[5]
                stop_target = data[6]
                item_idx = data[7]

                avg_length = mel_lengths.numpy().mean()
                assert avg_length >= last_length
            dataloader.dataset.sort_items()
            is_items_reordered = False
            for idx, item in enumerate(dataloader.dataset.items):
                if item != frames[idx]:
                    is_items_reordered = True
                    break
            assert is_items_reordered

    def test_padding_and_spec(self):
        if ok_ljspeech:
            dataloader, dataset = self._create_dataloader(1, 1, 0)

            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                text_input = data[0]
                text_lengths = data[1]
                speaker_name = data[2]
                linear_input = data[3]
                mel_input = data[4]
                mel_lengths = data[5]
                stop_target = data[6]
                item_idx = data[7]

                # check mel_spec consistency
                wav = np.asarray(self.ap.load_wav(item_idx[0]),
                                 dtype=np.float32)
                mel = self.ap.melspectrogram(wav).astype("float32")
                mel = torch.FloatTensor(mel).contiguous()
                mel_dl = mel_input[0]
                # NOTE: Below needs to check == 0 but due to an unknown reason
                # there is a slight difference between two matrices.
                # TODO: Check this assert cond more in detail.
                assert abs(mel.T - mel_dl).max() < 1e-5, abs(mel.T -
                                                             mel_dl).max()

                # check mel-spec correctness
                mel_spec = mel_input[0].cpu().numpy()
                wav = self.ap.inv_melspectrogram(mel_spec.T)
                self.ap.save_wav(wav, OUTPATH + "/mel_inv_dataloader.wav")
                shutil.copy(item_idx[0],
                            OUTPATH + "/mel_target_dataloader.wav")

                # check linear-spec
                linear_spec = linear_input[0].cpu().numpy()
                wav = self.ap.inv_spectrogram(linear_spec.T)
                self.ap.save_wav(wav, OUTPATH + "/linear_inv_dataloader.wav")
                shutil.copy(item_idx[0],
                            OUTPATH + "/linear_target_dataloader.wav")

                # batch size 1: no padding, so the last time steps must not be zero
                assert linear_input[0, -1].sum() != 0
                assert linear_input[0, -2].sum() != 0
                assert mel_input[0, -1].sum() != 0
                assert mel_input[0, -2].sum() != 0
                assert stop_target[0, -1] == 1
                assert stop_target[0, -2] == 0
                assert stop_target.sum() == 1
                assert len(mel_lengths.shape) == 1
                assert mel_lengths[0] == linear_input[0].shape[0]
                assert mel_lengths[0] == mel_input[0].shape[0]

            # Test for batch size 2
            dataloader, dataset = self._create_dataloader(2, 1, 0)

            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                text_input = data[0]
                text_lengths = data[1]
                speaker_name = data[2]
                linear_input = data[3]
                mel_input = data[4]
                mel_lengths = data[5]
                stop_target = data[6]
                item_idx = data[7]

                if mel_lengths[0] > mel_lengths[1]:
                    idx = 0
                else:
                    idx = 1

                # check the first item in the batch
                assert linear_input[idx, -1].sum() != 0
                assert linear_input[idx, -2].sum() != 0, linear_input
                assert mel_input[idx, -1].sum() != 0
                assert mel_input[idx, -2].sum() != 0, mel_input
                assert stop_target[idx, -1] == 1
                assert stop_target[idx, -2] == 0
                assert stop_target[idx].sum() == 1
                assert len(mel_lengths.shape) == 1
                assert mel_lengths[idx] == mel_input[idx].shape[0]
                assert mel_lengths[idx] == linear_input[idx].shape[0]

                # check the second item in the batch
                assert linear_input[1 - idx, -1].sum() == 0
                assert mel_input[1 - idx, -1].sum() == 0
                assert stop_target[1, mel_lengths[1] - 1] == 1
                assert stop_target[1, mel_lengths[1]:].sum() == 0
                assert len(mel_lengths.shape) == 1
Example #4
class Synthesizer(object):
    def __init__(self, config):
        self.wavernn = None
        self.pwgan = None
        self.vocoder_model = None
        self.config = config
        self.use_cuda = self.config.use_cuda
        if self.use_cuda:
            assert torch.cuda.is_available(
            ), "CUDA is not availabe on this machine."
        self.load_tts(self.config.tts_checkpoint, self.config.tts_config,
                      self.config.use_cuda)
        if self.config.vocoder_checkpoint:
            self.load_vocoder(self.config.vocoder_checkpoint,
                              self.config.vocoder_config, self.config.use_cuda)
        if self.config.wavernn_lib_path:
            self.load_wavernn(self.config.wavernn_lib_path,
                              self.config.wavernn_file,
                              self.config.wavernn_config, self.config.use_cuda)
        if self.config.pwgan_file:
            self.load_pwgan(self.config.pwgan_lib_path, self.config.pwgan_file,
                            self.config.pwgan_config, self.config.use_cuda)

    def load_tts(self, tts_checkpoint, tts_config, use_cuda):
        # pylint: disable=global-statement
        global symbols, phonemes

        print(" > Loading TTS model ...")
        print(" | > model config: ", tts_config)
        print(" | > checkpoint file: ", tts_checkpoint)

        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)

        if 'characters' in self.tts_config.keys():
            symbols, phonemes = make_symbols(**self.tts_config.characters)

        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
            self.input_size = len(symbols)
        # TODO: fix this for multi-speaker model - load speakers
        if self.config.tts_speakers is not None:
            self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
            num_speakers = len(self.tts_speakers)
        else:
            num_speakers = 0
        self.tts_model = setup_model(self.input_size,
                                     num_speakers=num_speakers,
                                     c=self.tts_config)
        # load model state
        cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
        # load the model
        self.tts_model.load_state_dict(cp['model'])
        if use_cuda:
            self.tts_model.cuda()
        self.tts_model.eval()
        self.tts_model.decoder.max_decoder_steps = 3000
        if 'r' in cp:
            self.tts_model.decoder.set_r(cp['r'])
            print(f" > model reduction factor: {cp['r']}")

    def load_vocoder(self, model_file, model_config, use_cuda):
        self.vocoder_config = load_config(model_config)
        self.vocoder_model = setup_generator(self.vocoder_config)
        self.vocoder_model.load_state_dict(
            torch.load(model_file, map_location="cpu")["model"])
        self.vocoder_model.remove_weight_norm()
        self.vocoder_model.inference_padding = 0

        if use_cuda:
            self.vocoder_model.cuda()
        self.vocoder_model.eval()

    def load_wavernn(self, lib_path, model_file, model_config, use_cuda):
        # TODO: set a function in wavernn code base for model setup and call it here.
        sys.path.append(
            lib_path)  # set this if WaveRNN is not installed globally
        #pylint: disable=import-outside-toplevel
        from WaveRNN.models.wavernn import Model
        print(" > Loading WaveRNN model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", model_file)
        self.wavernn_config = load_config(model_config)
        # This is the default architecture we use for our models.
        # You might need to update it
        self.wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=self.wavernn_config.mode,
            mulaw=self.wavernn_config.mulaw,
            pad=self.wavernn_config.pad,
            use_aux_net=self.wavernn_config.use_aux_net,
            use_upsample_net=self.wavernn_config.use_upsample_net,
            upsample_factors=self.wavernn_config.upsample_factors,
            feat_dims=80,
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=self.ap.hop_length,
            sample_rate=self.ap.sample_rate,
        ).cuda()

        check = torch.load(model_file, map_location="cpu")
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()

    def load_pwgan(self, lib_path, model_file, model_config, use_cuda):
        if lib_path:
            # set this if ParallelWaveGAN is not installed globally
            sys.path.append(lib_path)
        try:
            #pylint: disable=import-outside-toplevel
            from parallel_wavegan.models import ParallelWaveGANGenerator
        except ImportError as e:
            raise RuntimeError(
                f"cannot import parallel-wavegan, either install it or set its directory using the --pwgan_lib_path command line argument: {e}"
            )
        print(" > Loading PWGAN model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", model_file)
        with open(model_config) as f:
            self.pwgan_config = yaml.load(f, Loader=yaml.Loader)
        self.pwgan = ParallelWaveGANGenerator(
            **self.pwgan_config["generator_params"])
        self.pwgan.load_state_dict(
            torch.load(model_file, map_location="cpu")["model"]["generator"])
        self.pwgan.remove_weight_norm()
        if use_cuda:
            self.pwgan.cuda()
        self.pwgan.eval()

    def save_wav(self, wav, path):
        # wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    @staticmethod
    def split_into_sentences(text):
        text = " " + text + "  <stop>"
        text = text.replace("\n", " ")
        text = re.sub(prefixes, "\\1<prd>", text)
        text = re.sub(websites, "<prd>\\1", text)
        if "Ph.D" in text:
            text = text.replace("Ph.D.", "Ph<prd>D<prd>")
        text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
        text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
        text = re.sub(
            alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
            "\\1<prd>\\2<prd>\\3<prd>", text)
        text = re.sub(alphabets + "[.]" + alphabets + "[.]",
                      "\\1<prd>\\2<prd>", text)
        text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2",
                      text)
        text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
        text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
        if "”" in text:
            text = text.replace(".”", "”.")
        if "\"" in text:
            text = text.replace(".\"", "\".")
        if "!" in text:
            text = text.replace("!\"", "\"!")
        if "?" in text:
            text = text.replace("?\"", "\"?")
        text = text.replace(".", ".<stop>")
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        text = text.replace("<prd>", ".")
        sentences = text.split("<stop>")
        sentences = sentences[:-1]
        sentences = list(filter(
            None, [s.strip() for s in sentences]))  # remove empty sentences
        return sentences

    def tts(self, text, speaker_id=None):
        start_time = time.time()
        wavs = []
        sens = self.split_into_sentences(text)
        print(sens)
        speaker_id = id_to_torch(speaker_id)
        if speaker_id is not None and self.use_cuda:
            speaker_id = speaker_id.cuda()

        for sen in sens:
            # preprocess the given text
            inputs = text_to_seqvec(sen, self.tts_config)
            inputs = numpy_to_torch(inputs, torch.long, cuda=self.use_cuda)
            inputs = inputs.unsqueeze(0)
            # synthesize voice
            decoder_output, postnet_output, alignments, stop_tokens = run_model_torch(
                self.tts_model, inputs, self.tts_config, False, speaker_id,
                None)
            # convert outputs to numpy
            if self.vocoder_model:
                vocoder_input = postnet_output[0].transpose(0, 1).unsqueeze(0)
                wav = self.vocoder_model.inference(vocoder_input)
                if self.use_cuda:
                    wav = wav.cpu().numpy()
                else:
                    wav = wav.numpy()
                wav = wav.flatten()
            elif self.wavernn:
                vocoder_input = None
                if self.tts_config.model == "Tacotron":
                    vocoder_input = torch.FloatTensor(
                        self.ap.out_linear_to_mel(
                            linear_spec=postnet_output.T).T).T.unsqueeze(0)
                else:
                    vocoder_input = postnet_output[0].transpose(0,
                                                                1).unsqueeze(0)
                if self.use_cuda:
                    vocoder_input = vocoder_input.cuda()
                wav = self.wavernn.generate(
                    vocoder_input,
                    batched=self.config.is_wavernn_batched,
                    target=11000,
                    overlap=550)
            # trim silence
            wav = trim_silence(wav, self.ap)

            wavs += list(wav)
            wavs += [0] * 10000

        out = io.BytesIO()
        self.save_wav(wavs, out)

        # compute stats
        process_time = time.time() - start_time
        audio_time = len(wavs) / self.tts_config.audio['sample_rate']
        print(f" > Processing time: {process_time}")
        print(f" > Real-time factor: {process_time / audio_time}")
        return out
Example #5
class TestAudio(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.ap = AudioProcessor(**conf)

    def test_audio_synthesis(self):
        """1. load wav
        2. set normalization parameters
        3. extract mel-spec
        4. invert to wav and save the output
        """
        print(" > Sanity check for the process wav -> mel -> wav")

        def _test(max_norm, signal_norm, symmetric_norm, clip_norm):
            self.ap.max_norm = max_norm
            self.ap.signal_norm = signal_norm
            self.ap.symmetric_norm = symmetric_norm
            self.ap.clip_norm = clip_norm
            wav = self.ap.load_wav(WAV_FILE)
            mel = self.ap.melspectrogram(wav)
            wav_ = self.ap.inv_melspectrogram(mel)
            file_name = "/audio_test-melspec_max_norm_{}-signal_norm_{}-symmetric_{}-clip_norm_{}.wav".format(
                max_norm, signal_norm, symmetric_norm, clip_norm
            )
            print(" | > Creating wav file at : ", file_name)
            self.ap.save_wav(wav_, OUT_PATH + file_name)

        # maxnorm = 1.0
        _test(1.0, False, False, False)
        _test(1.0, True, False, False)
        _test(1.0, True, True, False)
        _test(1.0, True, False, True)
        _test(1.0, True, True, True)
        # maxnorm = 4.0
        _test(4.0, False, False, False)
        _test(4.0, True, False, False)
        _test(4.0, True, True, False)
        _test(4.0, True, False, True)
        _test(4.0, True, True, True)

    def test_normalize(self):
        """Check normalization and denormalization for range values and consistency"""
        print(" > Testing normalization and denormalization.")
        wav = self.ap.load_wav(WAV_FILE)
        wav = self.ap.sound_norm(wav)  # normalize audio to get a better normalization range below.
        self.ap.signal_norm = False
        x = self.ap.melspectrogram(wav)
        x_old = x

        self.ap.signal_norm = True
        self.ap.symmetric_norm = False
        self.ap.clip_norm = False
        self.ap.max_norm = 4.0
        x_norm = self.ap.normalize(x)
        print(
            f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} --  {x_norm.min()}"
        )
        assert (x_old - x).sum() == 0
        # check value range
        assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max()
        assert x_norm.min() >= 0 - 1, x_norm.min()
        # check denorm.
        x_ = self.ap.denormalize(x_norm)
        assert (x - x_).sum() < 1e-3, (x - x_).mean()

        self.ap.signal_norm = True
        self.ap.symmetric_norm = False
        self.ap.clip_norm = True
        self.ap.max_norm = 4.0
        x_norm = self.ap.normalize(x)
        print(
            f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} --  {x_norm.min()}"
        )

        assert (x_old - x).sum() == 0
        # check value range
        assert x_norm.max() <= self.ap.max_norm, x_norm.max()
        assert x_norm.min() >= 0, x_norm.min()
        # check denorm.
        x_ = self.ap.denormalize(x_norm)
        assert (x - x_).sum() < 1e-3, (x - x_).mean()

        self.ap.signal_norm = True
        self.ap.symmetric_norm = True
        self.ap.clip_norm = False
        self.ap.max_norm = 4.0
        x_norm = self.ap.normalize(x)
        print(
            f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} --  {x_norm.min()}"
        )

        assert (x_old - x).sum() == 0
        # check value range
        assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max()
        assert x_norm.min() >= -self.ap.max_norm - 2, x_norm.min()  # pylint: disable=invalid-unary-operand-type
        assert x_norm.min() <= 0, x_norm.min()
        # check denorm.
        x_ = self.ap.denormalize(x_norm)
        assert (x - x_).sum() < 1e-3, (x - x_).mean()

        self.ap.signal_norm = True
        self.ap.symmetric_norm = True
        self.ap.clip_norm = True
        self.ap.max_norm = 4.0
        x_norm = self.ap.normalize(x)
        print(
            f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} --  {x_norm.min()}"
        )

        assert (x_old - x).sum() == 0
        # check value range
        assert x_norm.max() <= self.ap.max_norm, x_norm.max()
        assert x_norm.min() >= -self.ap.max_norm, x_norm.min()  # pylint: disable=invalid-unary-operand-type
        assert x_norm.min() <= 0, x_norm.min()
        # check denorm.
        x_ = self.ap.denormalize(x_norm)
        assert (x - x_).sum() < 1e-3, (x - x_).mean()

        self.ap.signal_norm = True
        self.ap.symmetric_norm = False
        self.ap.max_norm = 1.0
        x_norm = self.ap.normalize(x)
        print(
            f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} --  {x_norm.min()}"
        )

        assert (x_old - x).sum() == 0
        assert x_norm.max() <= self.ap.max_norm, x_norm.max()
        assert x_norm.min() >= 0, x_norm.min()
        x_ = self.ap.denormalize(x_norm)
        assert (x - x_).sum() < 1e-3

        self.ap.signal_norm = True
        self.ap.symmetric_norm = True
        self.ap.max_norm = 1.0
        x_norm = self.ap.normalize(x)
        print(
            f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} --  {x_norm.min()}"
        )

        assert (x_old - x).sum() == 0
        assert x_norm.max() <= self.ap.max_norm, x_norm.max()
        assert x_norm.min() >= -self.ap.max_norm, x_norm.min()  # pylint: disable=invalid-unary-operand-type
        assert x_norm.min() < 0, x_norm.min()
        x_ = self.ap.denormalize(x_norm)
        assert (x - x_).sum() < 1e-3

    def test_scaler(self):
        scaler_stats_path = os.path.join(get_tests_input_path(), "scale_stats.npy")
        conf.stats_path = scaler_stats_path
        conf.preemphasis = 0.0
        conf.do_trim_silence = True
        conf.signal_norm = True

        ap = AudioProcessor(**conf)
        mel_mean, mel_std, linear_mean, linear_std, _ = ap.load_stats(scaler_stats_path)
        ap.setup_scaler(mel_mean, mel_std, linear_mean, linear_std)

        self.ap.signal_norm = False
        self.ap.preemphasis = 0.0

        # test scaler forward and backward transforms
        wav = self.ap.load_wav(WAV_FILE)
        mel_reference = self.ap.melspectrogram(wav)
        mel_norm = ap.melspectrogram(wav)
        mel_denorm = ap.denormalize(mel_norm)
        assert abs(mel_reference - mel_denorm).max() < 1e-4

    def test_compute_f0(self):  # pylint: disable=no-self-use
        ap = AudioProcessor(**conf)
        wav = ap.load_wav(WAV_FILE)
        pitch = ap.compute_f0(wav)
        mel = ap.melspectrogram(wav)
        assert pitch.shape[0] == mel.shape[1]
Example #6
class TestTTSDataset(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.max_loader_iter = 4
        self.ap = AudioProcessor(**c.audio)

    def _create_dataloader(self, batch_size, r, bgs, start_by_longest=False):

        # load dataset
        meta_data_train, meta_data_eval = load_tts_samples(dataset_config,
                                                           eval_split=True,
                                                           eval_split_size=0.2)
        items = meta_data_train + meta_data_eval

        tokenizer, _ = TTSTokenizer.init_from_config(c)
        dataset = TTSDataset(
            outputs_per_step=r,
            compute_linear_spec=True,
            return_wav=True,
            tokenizer=tokenizer,
            ap=self.ap,
            samples=items,
            batch_group_size=bgs,
            min_text_len=c.min_text_len,
            max_text_len=c.max_text_len,
            min_audio_len=c.min_audio_len,
            max_audio_len=c.max_audio_len,
            start_by_longest=start_by_longest,
        )
        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=False,
            collate_fn=dataset.collate_fn,
            drop_last=True,
            num_workers=c.num_loader_workers,
        )
        return dataloader, dataset

    def test_loader(self):
        if ok_ljspeech:
            dataloader, dataset = self._create_dataloader(1, 1, 0)

            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                text_input = data["token_id"]
                _ = data["token_id_lengths"]
                speaker_name = data["speaker_names"]
                linear_input = data["linear"]
                mel_input = data["mel"]
                mel_lengths = data["mel_lengths"]
                _ = data["stop_targets"]
                _ = data["item_idxs"]
                wavs = data["waveform"]

                neg_values = text_input[text_input < 0]
                check_count = len(neg_values)

                # check basic conditions
                self.assertEqual(check_count, 0)
                self.assertEqual(linear_input.shape[0], c.batch_size)
                self.assertEqual(mel_input.shape[0], c.batch_size)
                self.assertEqual(linear_input.shape[2],
                                 self.ap.fft_size // 2 + 1)
                self.assertEqual(mel_input.shape[2], c.audio["num_mels"])
                self.assertEqual(wavs.shape[1],
                                 mel_input.shape[1] * c.audio.hop_length)
                self.assertIsInstance(speaker_name[0], str)

                # make sure that the computed mels and the waveform match and correctly computed
                mel_new = self.ap.melspectrogram(wavs[0].squeeze().numpy())
                # remove padding in mel-spectrogram
                mel_dataloader = mel_input[0].T.numpy()[:, :mel_lengths[0]]
                # guarantee that both mel-spectrograms have the same size and that we will remove waveform padding
                mel_new = mel_new[:, :mel_lengths[0]]
                ignore_seg = -(1 + c.audio.win_length // c.audio.hop_length)
                mel_diff = (mel_new[:, :mel_input.shape[1]] -
                            mel_input[0].T.numpy())[:, 0:ignore_seg]
                self.assertLess(abs(mel_diff.sum()), 1e-5)

                # check normalization ranges
                if self.ap.symmetric_norm:
                    self.assertLessEqual(mel_input.max(), self.ap.max_norm)
                    self.assertGreaterEqual(
                        mel_input.min(),
                        -self.ap.max_norm  # pylint: disable=invalid-unary-operand-type
                    )
                    self.assertLess(mel_input.min(), 0)
                else:
                    self.assertLessEqual(mel_input.max(), self.ap.max_norm)
                    self.assertGreaterEqual(mel_input.min(), 0)

    def test_batch_group_shuffle(self):
        if ok_ljspeech:
            dataloader, dataset = self._create_dataloader(2, c.r, 16)
            last_length = 0
            frames = dataset.samples
            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                mel_lengths = data["mel_lengths"]
                avg_length = mel_lengths.numpy().mean()
            dataloader.dataset.preprocess_samples()
            is_items_reordered = False
            for idx, item in enumerate(dataloader.dataset.samples):
                if item != frames[idx]:
                    is_items_reordered = True
                    break
            self.assertGreaterEqual(avg_length, last_length)
            self.assertTrue(is_items_reordered)

    def test_start_by_longest(self):
        """Test start_by_longest option.

        The first item of the first batch must be longer than all the other items.
        """
        if ok_ljspeech:
            dataloader, _ = self._create_dataloader(2, c.r, 0, True)
            dataloader.dataset.preprocess_samples()
            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                mel_lengths = data["mel_lengths"]
                if i == 0:
                    max_len = mel_lengths[0]
                print(mel_lengths)
                self.assertTrue(all(max_len >= mel_lengths))

    def test_padding_and_spectrograms(self):
        def check_conditions(idx, linear_input, mel_input, stop_target,
                             mel_lengths):
            self.assertNotEqual(linear_input[idx, -1].sum(),
                                0)  # check padding
            self.assertNotEqual(linear_input[idx, -2].sum(), 0)
            self.assertNotEqual(mel_input[idx, -1].sum(), 0)
            self.assertNotEqual(mel_input[idx, -2].sum(), 0)
            self.assertEqual(stop_target[idx, -1], 1)
            self.assertEqual(stop_target[idx, -2], 0)
            self.assertEqual(stop_target[idx].sum(), 1)
            self.assertEqual(len(mel_lengths.shape), 1)
            self.assertEqual(mel_lengths[idx], linear_input[idx].shape[0])
            self.assertEqual(mel_lengths[idx], mel_input[idx].shape[0])

        if ok_ljspeech:
            dataloader, _ = self._create_dataloader(1, 1, 0)

            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                linear_input = data["linear"]
                mel_input = data["mel"]
                mel_lengths = data["mel_lengths"]
                stop_target = data["stop_targets"]
                item_idx = data["item_idxs"]

                # check mel_spec consistency
                wav = np.asarray(self.ap.load_wav(item_idx[0]),
                                 dtype=np.float32)
                mel = self.ap.melspectrogram(wav).astype("float32")
                mel = torch.FloatTensor(mel).contiguous()
                mel_dl = mel_input[0]
                # NOTE: Below needs to check == 0 but due to an unknown reason
                # there is a slight difference between two matrices.
                # TODO: Check this assert cond more in detail.
                self.assertLess(abs(mel.T - mel_dl).max(), 1e-5)

                # check mel-spec correctness
                mel_spec = mel_input[0].cpu().numpy()
                wav = self.ap.inv_melspectrogram(mel_spec.T)
                self.ap.save_wav(wav, OUTPATH + "/mel_inv_dataloader.wav")
                shutil.copy(item_idx[0],
                            OUTPATH + "/mel_target_dataloader.wav")

                # check linear-spec
                linear_spec = linear_input[0].cpu().numpy()
                wav = self.ap.inv_spectrogram(linear_spec.T)
                self.ap.save_wav(wav, OUTPATH + "/linear_inv_dataloader.wav")
                shutil.copy(item_idx[0],
                            OUTPATH + "/linear_target_dataloader.wav")

                # check the outputs
                check_conditions(0, linear_input, mel_input, stop_target,
                                 mel_lengths)

            # Test for batch size 2
            dataloader, _ = self._create_dataloader(2, 1, 0)

            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                linear_input = data["linear"]
                mel_input = data["mel"]
                mel_lengths = data["mel_lengths"]
                stop_target = data["stop_targets"]
                item_idx = data["item_idxs"]

                # set id to the longest sequence in the batch
                if mel_lengths[0] > mel_lengths[1]:
                    idx = 0
                else:
                    idx = 1

                # check the longer item in the batch
                check_conditions(idx, linear_input, mel_input, stop_target,
                                 mel_lengths)

                # check the other item in the batch
                self.assertEqual(linear_input[1 - idx, -1].sum(), 0)
                self.assertEqual(mel_input[1 - idx, -1].sum(), 0)
                self.assertEqual(stop_target[1, mel_lengths[1] - 1], 1)
                self.assertEqual(stop_target[1, mel_lengths[1]:].sum(),
                                 stop_target.shape[1] - mel_lengths[1])
                self.assertEqual(len(mel_lengths.shape), 1)
Example #7
class Synthesizer(object):
    def __init__(self,
                 tts_checkpoint,
                 tts_config,
                 vocoder_checkpoint=None,
                 vocoder_config=None,
                 use_cuda=False):
        """General 🐸 TTS interface for inference. It takes a tts and a vocoder
        model and synthesize speech from the provided text.

        The text is divided into a list of sentences using `pysbd` and synthesize
        speech on each sentence separately.

        If you have certain special characters in your text, you need to handle
        them before providing the text to Synthesizer.

        TODO: handle multi-speaker and GST inference.

        Args:
            tts_checkpoint (str): path to the tts model file.
            tts_config (str): path to the tts config file.
            vocoder_checkpoint (str, optional): path to the vocoder model file. Defaults to None.
            vocoder_config (str, optional): path to the vocoder config file. Defaults to None.
            use_cuda (bool, optional): enable/disable cuda. Defaults to False.
        """
        self.tts_checkpoint = tts_checkpoint
        self.tts_config = tts_config
        self.vocoder_checkpoint = vocoder_checkpoint
        self.vocoder_config = vocoder_config
        self.use_cuda = use_cuda
        self.wavernn = None
        self.vocoder_model = None
        self.num_speakers = 0
        self.tts_speakers = None
        self.speaker_embedding_dim = None
        self.seg = self.get_segmenter("en")
        if self.use_cuda:
            assert torch.cuda.is_available(
            ), "CUDA is not availabe on this machine."
        self.load_tts(tts_checkpoint, tts_config, use_cuda)
        self.output_sample_rate = self.tts_config.audio["sample_rate"]
        if vocoder_checkpoint:
            self.load_vocoder(vocoder_checkpoint, vocoder_config, use_cuda)
            self.output_sample_rate = self.vocoder_config.audio["sample_rate"]

    @staticmethod
    def get_segmenter(lang):
        return pysbd.Segmenter(language=lang, clean=True)

    def load_speakers(self):
        # load speakers
        if self.tts_config.use_speaker_embedding is not None:
            self.tts_speakers = load_speaker_mapping(
                self.tts_config.tts_speakers_json)
            self.num_speakers = len(self.tts_speakers)
        else:
            self.num_speakers = 0
        # set external speaker embedding
        if self.tts_config.use_external_speaker_embedding_file:
            speaker_embedding = self.tts_speakers[list(
                self.tts_speakers.keys())[0]]["embedding"]
            self.speaker_embedding_dim = len(speaker_embedding)

    def init_speaker(self, speaker_idx):
        # load speakers
        speaker_embedding = None
        if hasattr(self, "tts_speakers") and speaker_idx is not None:
            assert speaker_idx < len(
                self.tts_speakers
            ), f" [!] speaker_idx is out of the range. {speaker_idx} vs {len(self.tts_speakers)}"
            if self.tts_config.use_external_speaker_embedding_file:
                speaker_embedding = self.tts_speakers[speaker_idx]["embedding"]
        return speaker_embedding

    def load_tts(self, tts_checkpoint, tts_config, use_cuda):
        # pylint: disable=global-statement

        global symbols, phonemes

        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(verbose=False, **self.tts_config.audio)

        if "characters" in self.tts_config.keys():
            symbols, phonemes = make_symbols(**self.tts_config.characters)

        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
            self.input_size = len(symbols)

        self.tts_model = setup_model(self.input_size,
                                     num_speakers=self.num_speakers,
                                     c=self.tts_config)
        self.tts_model.load_checkpoint(tts_config, tts_checkpoint, eval=True)
        if use_cuda:
            self.tts_model.cuda()

    def load_vocoder(self, model_file, model_config, use_cuda):
        self.vocoder_config = load_config(model_config)
        self.vocoder_ap = AudioProcessor(verbose=False,
                                         **self.vocoder_config["audio"])
        self.vocoder_model = setup_generator(self.vocoder_config)
        self.vocoder_model.load_checkpoint(self.vocoder_config,
                                           model_file,
                                           eval=True)
        if use_cuda:
            self.vocoder_model.cuda()

    def save_wav(self, wav, path):
        wav = np.array(wav)
        self.ap.save_wav(wav, path, self.output_sample_rate)

    def split_into_sentences(self, text):
        return self.seg.segment(text)

    def tts(self, text, speaker_idx=None):
        start_time = time.time()
        wavs = []
        sens = self.split_into_sentences(text)
        print(" > Text splitted to sentences.")
        print(sens)

        speaker_embedding = self.init_speaker(speaker_idx)
        use_gl = self.vocoder_model is None

        for sen in sens:
            # synthesize voice
            waveform, _, _, mel_postnet_spec, _, _ = synthesis(
                self.tts_model,
                sen,
                self.tts_config,
                self.use_cuda,
                self.ap,
                speaker_idx,
                None,
                False,
                self.tts_config.enable_eos_bos_chars,
                use_gl,
                speaker_embedding=speaker_embedding,
            )
            if not use_gl:
                # denormalize tts output based on tts audio config
                mel_postnet_spec = self.ap.denormalize(mel_postnet_spec.T).T
                device_type = "cuda" if self.use_cuda else "cpu"
                # renormalize spectrogram based on vocoder config
                vocoder_input = self.vocoder_ap.normalize(mel_postnet_spec.T)
                # compute scale factor for possible sample rate mismatch
                scale_factor = [
                    1, self.vocoder_config["audio"]["sample_rate"] /
                    self.ap.sample_rate
                ]
                if scale_factor[1] != 1:
                    print(" > interpolating tts model output.")
                    vocoder_input = interpolate_vocoder_input(
                        scale_factor, vocoder_input)
                else:
                    vocoder_input = torch.tensor(vocoder_input).unsqueeze(0)  # pylint: disable=not-callable
                # run vocoder model
                # [1, T, C]
                waveform = self.vocoder_model.inference(
                    vocoder_input.to(device_type))
            if self.use_cuda and not use_gl:
                waveform = waveform.cpu()
            if not use_gl:
                waveform = waveform.numpy()
            waveform = waveform.squeeze()

            # trim silence
            waveform = trim_silence(waveform, self.ap)

            wavs += list(waveform)
            wavs += [0] * 10000

        # compute stats
        process_time = time.time() - start_time
        audio_time = len(wavs) / self.tts_config.audio["sample_rate"]
        print(f" > Processing time: {process_time}")
        print(f" > Real-time factor: {process_time / audio_time}")
        return wavs
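
The docstring in this example describes the constructor arguments and the tts/save_wav calls; below is a minimal, hypothetical usage sketch for this example's Synthesizer. The checkpoint and config paths are placeholders (assumptions), and the call signatures follow the code shown above.

# Hypothetical usage sketch for the Synthesizer in Example #7; the paths
# below are placeholders, not real files shipped with the project.
synthesizer = Synthesizer(
    tts_checkpoint="tts_model/checkpoint.pth.tar",
    tts_config="tts_model/config.json",
    vocoder_checkpoint="vocoder_model/checkpoint.pth.tar",  # optional; Griffin-Lim is used when omitted
    vocoder_config="vocoder_model/config.json",
    use_cuda=False,
)
# tts() returns a list of samples; save_wav() converts it to an array and writes it to disk.
wav = synthesizer.tts("Hello world. This is a test sentence.")
synthesizer.save_wav(wav, "output.wav")
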
Example #8
class TestAudio(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super(TestAudio, self).__init__(*args, **kwargs)
        self.ap = AudioProcessor(**conf.audio)

    def test_audio_synthesis(self):
        """ 1. load wav
            2. set normalization parameters
            3. extract mel-spec
            4. invert to wav and save the output
        """
        print(" > Sanity check for the process wav -> mel -> wav")

        def _test(max_norm, signal_norm, symmetric_norm, clip_norm):
            self.ap.max_norm = max_norm
            self.ap.signal_norm = signal_norm
            self.ap.symmetric_norm = symmetric_norm
            self.ap.clip_norm = clip_norm
            wav = self.ap.load_wav(WAV_FILE)
            mel = self.ap.melspectrogram(wav)
            wav_ = self.ap.inv_mel_spectrogram(mel)
            file_name = "/audio_test-melspec_max_norm_{}-signal_norm_{}-symmetric_{}-clip_norm_{}.wav"\
                .format(max_norm, signal_norm, symmetric_norm, clip_norm)
            print(" | > Creating wav file at : ", file_name)
            self.ap.save_wav(wav_, OUT_PATH + file_name)

        # maxnorm = 1.0
        _test(1., False, False, False)
        _test(1., True, False, False)
        _test(1., True, True, False)
        _test(1., True, False, True)
        _test(1., True, True, True)
        # maxnorm = 4.0
        _test(4., False, False, False)
        _test(4., True, False, False)
        _test(4., True, True, False)
        _test(4., True, False, True)
        _test(4., True, True, True)

    def test_normalize(self):
        """Check normalization and denormalization for range values and consistency """
        print(" > Testing normalization and denormalization.")
        wav = self.ap.load_wav(WAV_FILE)
        self.ap.signal_norm = False
        x = self.ap.melspectrogram(wav)
        x_old = x

        self.ap.signal_norm = True
        self.ap.symmetric_norm = False
        self.ap.clip_norm = False
        self.ap.max_norm = 4.0
        x_norm = self.ap._normalize(x)
        print(x_norm.max(), " -- ", x_norm.min())
        assert (x_old - x).sum() == 0
        # check value range
        assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max()
        assert x_norm.min() >= 0 - 1, x_norm.min()
        # check denorm.
        x_ = self.ap._denormalize(x_norm)
        assert (x - x_).sum() < 1e-3, (x - x_).mean()

        self.ap.signal_norm = True
        self.ap.symmetric_norm = False
        self.ap.clip_norm = True
        self.ap.max_norm = 4.0
        x_norm = self.ap._normalize(x)
        print(x_norm.max(), " -- ", x_norm.min())
        assert (x_old - x).sum() == 0
        # check value range
        assert x_norm.max() <= self.ap.max_norm, x_norm.max()
        assert x_norm.min() >= 0, x_norm.min()
        # check denorm.
        x_ = self.ap._denormalize(x_norm)
        assert (x - x_).sum() < 1e-3, (x - x_).mean()

        self.ap.signal_norm = True
        self.ap.symmetric_norm = True
        self.ap.clip_norm = False
        self.ap.max_norm = 4.0
        x_norm = self.ap._normalize(x)
        print(x_norm.max(), " -- ", x_norm.min())
        assert (x_old - x).sum() == 0
        # check value range
        assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max()
        assert x_norm.min() >= -self.ap.max_norm - 2, x_norm.min()
        assert x_norm.min() <= 0, x_norm.min()
        # check denorm.
        x_ = self.ap._denormalize(x_norm)
        assert (x - x_).sum() < 1e-3, (x - x_).mean()

        self.ap.signal_norm = True
        self.ap.symmetric_norm = True
        self.ap.clip_norm = True
        self.ap.max_norm = 4.0
        x_norm = self.ap._normalize(x)
        print(x_norm.max(), " -- ", x_norm.min())
        assert (x_old - x).sum() == 0
        # check value range
        assert x_norm.max() <= self.ap.max_norm, x_norm.max()
        assert x_norm.min() >= -self.ap.max_norm, x_norm.min()
        assert x_norm.min() <= 0, x_norm.min()
        # check denorm.
        x_ = self.ap._denormalize(x_norm)
        assert (x - x_).sum() < 1e-3, (x - x_).mean()

        self.ap.signal_norm = True
        self.ap.symmetric_norm = False
        self.ap.max_norm = 1.0
        x_norm = self.ap._normalize(x)
        print(x_norm.max(), " -- ", x_norm.min())
        assert (x_old - x).sum() == 0
        assert x_norm.max() <= self.ap.max_norm, x_norm.max()
        assert x_norm.min() >= 0, x_norm.min()
        x_ = self.ap._denormalize(x_norm)
        assert (x - x_).sum() < 1e-3

        self.ap.signal_norm = True
        self.ap.symmetric_norm = True
        self.ap.max_norm = 1.0
        x_norm = self.ap._normalize(x)
        print(x_norm.max(), " -- ", x_norm.min())
        assert (x_old - x).sum() == 0
        assert x_norm.max() <= self.ap.max_norm, x_norm.max()
        assert x_norm.min() >= -self.ap.max_norm, x_norm.min()
        assert x_norm.min() < 0, x_norm.min()
        x_ = self.ap._denormalize(x_norm)
        assert (x - x_).sum() < 1e-3
Example #9
class Synthesizer(object):
    def __init__(self, config):
        self.wavernn = None
        self.vocoder_model = None
        self.config = config
        print(config)
        self.seg = self.get_segmenter("en")
        self.use_cuda = self.config.use_cuda
        if self.use_cuda:
            assert torch.cuda.is_available(
            ), "CUDA is not availabe on this machine."
        self.load_tts(self.config.tts_checkpoint, self.config.tts_config,
                      self.config.use_cuda)
        if self.config.vocoder_checkpoint:
            self.load_vocoder(self.config.vocoder_checkpoint,
                              self.config.vocoder_config, self.config.use_cuda)
        if self.config.wavernn_lib_path:
            self.load_wavernn(self.config.wavernn_lib_path,
                              self.config.wavernn_checkpoint,
                              self.config.wavernn_config, self.config.use_cuda)

    @staticmethod
    def get_segmenter(lang):
        return pysbd.Segmenter(language=lang, clean=True)

    def load_tts(self, tts_checkpoint, tts_config, use_cuda):
        # pylint: disable=global-statement
        global symbols, phonemes

        print(" > Loading TTS model ...")
        print(" | > model config: ", tts_config)
        print(" | > checkpoint file: ", tts_checkpoint)

        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)

        if 'characters' in self.tts_config.keys():
            symbols, phonemes = make_symbols(**self.tts_config.characters)

        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
            self.input_size = len(symbols)
        # TODO: fix this for multi-speaker model - load speakers
        if self.config.tts_speakers is not None:
            self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
            num_speakers = len(self.tts_speakers)
        else:
            num_speakers = 0
        self.tts_model = setup_model(self.input_size,
                                     num_speakers=num_speakers,
                                     c=self.tts_config)
        # load model state
        cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
        # load the model
        self.tts_model.load_state_dict(cp['model'])
        if use_cuda:
            self.tts_model.cuda()
        self.tts_model.eval()
        self.tts_model.decoder.max_decoder_steps = 3000
        if 'r' in cp:
            self.tts_model.decoder.set_r(cp['r'])
            print(f" > model reduction factor: {cp['r']}")

    def load_vocoder(self, model_file, model_config, use_cuda):
        self.vocoder_config = load_config(model_config)
        self.vocoder_model = setup_generator(self.vocoder_config)
        self.vocoder_model.load_state_dict(
            torch.load(model_file, map_location="cpu")["model"])
        self.vocoder_model.remove_weight_norm()
        self.vocoder_model.inference_padding = 0

        if use_cuda:
            self.vocoder_model.cuda()
        self.vocoder_model.eval()

    def load_wavernn(self, lib_path, model_file, model_config, use_cuda):
        # TODO: set a function in wavernn code base for model setup and call it here.
        sys.path.append(
            lib_path)  # set this if WaveRNN is not installed globally
        #pylint: disable=import-outside-toplevel
        from WaveRNN.models.wavernn import Model
        print(" > Loading WaveRNN model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", model_file)
        self.wavernn_config = load_config(model_config)
        # This is the default architecture we use for our models.
        # You might need to update it
        self.wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=self.wavernn_config.mode,
            mulaw=self.wavernn_config.mulaw,
            pad=self.wavernn_config.pad,
            use_aux_net=self.wavernn_config.use_aux_net,
            use_upsample_net=self.wavernn_config.use_upsample_net,
            upsample_factors=self.wavernn_config.upsample_factors,
            feat_dims=80,
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=self.ap.hop_length,
            sample_rate=self.ap.sample_rate,
        ).cuda()

        check = torch.load(model_file, map_location="cpu")
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()

    def save_wav(self, wav, path):
        # wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    def split_into_sentences(self, text):
        return self.seg.segment(text)

    def tts(self, text, speaker_id=None):
        start_time = time.time()
        wavs = []
        sens = self.split_into_sentences(text)
        print(sens)
        speaker_id = id_to_torch(speaker_id)
        if speaker_id is not None and self.use_cuda:
            speaker_id = speaker_id.cuda()

        for sen in sens:
            # preprocess the given text
            inputs = text_to_seqvec(sen, self.tts_config)
            inputs = numpy_to_torch(inputs, torch.long, cuda=self.use_cuda)
            inputs = inputs.unsqueeze(0)
            # synthesize voice
            _, postnet_output, _, _ = run_model_torch(self.tts_model, inputs,
                                                      self.tts_config, False,
                                                      speaker_id, None)
            if self.vocoder_model:
                # use native vocoder model
                vocoder_input = postnet_output[0].transpose(0, 1).unsqueeze(0)
                wav = self.vocoder_model.inference(vocoder_input)
                if self.use_cuda:
                    wav = wav.cpu().numpy()
                else:
                    wav = wav.numpy()
                wav = wav.flatten()
            elif self.wavernn:
                # use 3rd party WaveRNN
                vocoder_input = None
                if self.tts_config.model == "Tacotron":
                    vocoder_input = torch.FloatTensor(
                        self.ap.out_linear_to_mel(
                            linear_spec=postnet_output.T).T).T.unsqueeze(0)
                else:
                    vocoder_input = postnet_output[0].transpose(0,
                                                                1).unsqueeze(0)
                if self.use_cuda:
                    vocoder_input = vocoder_input.cuda()
                wav = self.wavernn.generate(
                    vocoder_input,
                    batched=self.config.is_wavernn_batched,
                    target=11000,
                    overlap=550)
            else:
                # use GL
                if self.use_cuda:
                    postnet_output = postnet_output[0].cpu()
                else:
                    postnet_output = postnet_output[0]
                postnet_output = postnet_output.numpy()
                wav = inv_spectrogram(postnet_output, self.ap, self.tts_config)

            # trim silence
            wav = trim_silence(wav, self.ap)

            wavs += list(wav)
            wavs += [0] * 10000

        out = io.BytesIO()
        self.save_wav(wavs, out)

        # compute stats
        process_time = time.time() - start_time
        audio_time = len(wavs) / self.tts_config.audio['sample_rate']
        print(f" > Processing time: {process_time}")
        print(f" > Real-time factor: {process_time / audio_time}")
        return out
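
# Usage sketch (added for illustration; the class constructor is not shown in
# this excerpt). Assuming an already-built `synthesizer` instance of the class
# above, tts() returns an in-memory WAV buffer:
#     wav_io = synthesizer.tts("Hello world. This is a test sentence.")
#     with open("output.wav", "wb") as out_file:
#         out_file.write(wav_io.getvalue())
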
Example #10
class Synthesizer(object):
    def __init__(self, config):
        self.wavernn = None
        self.config = config
        self.use_cuda = config.use_cuda
        if self.use_cuda:
            assert torch.cuda.is_available(
            ), "CUDA is not availabe on this machine."
        self.load_tts(self.config.tts_path, self.config.tts_file,
                      self.config.tts_config, config.use_cuda)
        if self.config.wavernn_lib_path:
            self.load_wavernn(config.wavernn_lib_path, config.wavernn_path,
                              config.wavernn_file, config.wavernn_config,
                              config.use_cuda)

    def load_tts(self, model_path, model_file, model_config, use_cuda):
        tts_config = os.path.join(model_path, model_config)
        self.model_file = os.path.join(model_path, model_file)
        print(" > Loading TTS model ...")
        print(" | > model config: ", tts_config)
        print(" | > model file: ", model_file)
        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)
        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
            self.input_size = len(symbols)
        # load speakers
        if self.config.tts_speakers is not None:
            self.tts_speakers = load_speaker_mapping(
                os.path.join(model_path, self.config.tts_speakers))
            num_speakers = len(self.tts_speakers)
        else:
            num_speakers = 0
        self.tts_model = setup_model(self.input_size,
                                     num_speakers=num_speakers,
                                     c=self.tts_config)
        # load model state
        cp = torch.load(self.model_file)
        # load the model
        self.tts_model.load_state_dict(cp['model'])
        if use_cuda:
            self.tts_model.cuda()
        self.tts_model.eval()
        self.tts_model.decoder.max_decoder_steps = 3000
        if 'r' in cp and self.tts_config.model in ["Tacotron", "TacotronGST"]:
            self.tts_model.decoder.set_r(cp['r'])

    def load_wavernn(self, lib_path, model_path, model_file, model_config,
                     use_cuda):
        # TODO: set a function in wavernn code base for model setup and call it here.
        sys.path.append(lib_path)  # set this if WaveRNN is not installed globally
        from WaveRNN.models.wavernn import Model
        wavernn_config = os.path.join(model_path, model_config)
        model_file = os.path.join(model_path, model_file)
        print(" > Loading WaveRNN model ...")
        print(" | > model config: ", wavernn_config)
        print(" | > model file: ", model_file)
        self.wavernn_config = load_config(wavernn_config)
        self.wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=self.wavernn_config.mode,
            mulaw=self.wavernn_config.mulaw,
            pad=self.wavernn_config.pad,
            use_aux_net=self.wavernn_config.use_aux_net,
            use_upsample_net=self.wavernn_config.use_upsample_net,
            upsample_factors=self.wavernn_config.upsample_factors,
            feat_dims=80,
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=self.ap.hop_length,
            sample_rate=self.ap.sample_rate,
        )

        check = torch.load(model_file, map_location="cpu")
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()

    def save_wav(self, wav, path):
        # wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    def split_into_sentences(self, text):
        text = " " + text + "  "
        text = text.replace("\n", " ")
        text = re.sub(prefixes, "\\1<prd>", text)
        text = re.sub(websites, "<prd>\\1", text)
        if "Ph.D" in text:
            text = text.replace("Ph.D.", "Ph<prd>D<prd>")
        text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
        text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
        text = re.sub(
            alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
            "\\1<prd>\\2<prd>\\3<prd>", text)
        text = re.sub(alphabets + "[.]" + alphabets + "[.]",
                      "\\1<prd>\\2<prd>", text)
        text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2",
                      text)
        text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
        text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
        if "”" in text:
            text = text.replace(".”", "”.")
        if "\"" in text:
            text = text.replace(".\"", "\".")
        if "!" in text:
            text = text.replace("!\"", "\"!")
        if "?" in text:
            text = text.replace("?\"", "\"?")
        text = text.replace(".", ".<stop>")
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        text = text.replace("<prd>", ".")
        sentences = text.split("<stop>")
        sentences = sentences[:-1]
        sentences = [s.strip() for s in sentences]
        return sentences
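
    # Note (added for illustration): assuming the usual module-level `alphabets`,
    # `prefixes`, `suffixes`, `acronyms`, `starters` and `websites` regexes, this
    # splitter is expected to turn e.g. "Dr. Smith arrived. He was early." into
    # ["Dr. Smith arrived.", "He was early."].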

    def tts(self, text):
        wavs = []
        sens = self.split_into_sentences(text)
        print(sens)
        if not sens:
            sens = [text + '.']
        for sen in sens:
            # preprocess the given text
            inputs = text_to_seqvec(sen, self.tts_config, self.use_cuda)
            # synthesize voice
            decoder_output, postnet_output, alignments, _ = run_model(
                self.tts_model, inputs, self.tts_config, False, None, None)
            # convert outputs to numpy
            postnet_output, decoder_output, _ = parse_outputs(
                postnet_output, decoder_output, alignments)

            if self.wavernn:
                postnet_output = postnet_output[0].data.cpu().numpy()
                vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0)
                if self.use_cuda:
                    vocoder_input = vocoder_input.cuda()
                wav = self.wavernn.generate(
                    vocoder_input,
                    batched=self.config.is_wavernn_batched,
                    target=11000,
                    overlap=550)
            else:
                wav = inv_spectrogram(postnet_output, self.ap, self.tts_config)
            # trim silence
            wav = trim_silence(wav, self.ap)

            wavs += list(wav)
            wavs += [0] * 10000

        out = io.BytesIO()
        self.save_wav(wavs, out)
        return out
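

# Usage sketch (added for illustration; the paths and config fields below are
# assumptions based on the attributes referenced in __init__, not part of the
# original example). With no WaveRNN configured, Griffin-Lim is used.
from types import SimpleNamespace

if __name__ == "__main__":
    config = SimpleNamespace(
        use_cuda=False,
        tts_path="models/tts/",            # hypothetical model directory
        tts_file="best_model.pth.tar",     # hypothetical checkpoint file name
        tts_config="config.json",
        tts_speakers=None,
        wavernn_lib_path=None,             # leave None to skip the WaveRNN vocoder
        is_wavernn_batched=True,
    )
    synth = Synthesizer(config)
    wav_io = synth.tts("Hello world. This is a test sentence.")
    with open("output.wav", "wb") as out_file:
        out_file.write(wav_io.getvalue())
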
Example #11
class Synthesizer:
    def __init__(
        self,
        config_path,
        model_path,
        use_cuda=False,
        vocoder_path="",
        vocoder_config_path="",
        batched_vocoder=True,
        speakers_json="",
        speaker_fileid=None,
        gst_style=None,
        wavegrad_iters=50,
    ):
        self.config_path = config_path
        self.model_path = model_path
        self.use_cuda = use_cuda
        self.vocoder_path = vocoder_path
        self.vocoder_config_path = vocoder_config_path
        self.batched_vocoder = batched_vocoder
        self.speakers_json = speakers_json
        self.speaker_fileid = speaker_fileid
        self.gst_style = gst_style
        self.wavegrad_iters = wavegrad_iters

        self.model = None

    def load(self):
        # load the config
        C = load_config(self.config_path)
        self.config = C

        # Resolve scale_stats path
        stats_path = C.audio.get("stats_path")
        if stats_path and not os.path.isfile(stats_path):
            # Look for stats next to config
            model_stats_path = os.path.join(os.path.dirname(self.config_path),
                                            "scale_stats.npy")
            if os.path.isfile(model_stats_path):
                # Patch config
                C.audio["stats_path"] = model_stats_path
            else:
                _LOGGER.warning("No scale stats found at %s",
                                C.audio["stats_path"])
                C.audio["stats_path"] = ""

        C.forward_attn_mask = True

        if "gst" not in C.keys():
            # Patch config
            gst = {
                "gst_use_speaker_embedding": False,
                "gst_style_input": None,
                "gst_embedding_dim": 512,
                "gst_num_heads": 4,
                "gst_style_tokens": 10,
            }

            C["gst"] = gst
            setattr(C, "gst", gst)

        if "use_external_speaker_embedding_file" not in C.keys():
            C["use_external_speaker_embedding_file"] = False
            setattr(C, "use_external_speaker_embedding_file", False)

        if "gst_use_speaker_embedding" not in C.gst:
            C.gst["gst_use_speaker_embedding"] = False

        # load the audio processor
        ap = AudioProcessor(**C.audio)
        self.ap = ap

        # if the vocabulary was passed, replace the default
        if "characters" in C.keys():
            symbols, phonemes = make_symbols(**C.characters)
        else:
            from TTS.tts.utils.text.symbols import phonemes, symbols

        speaker_embedding = None
        speaker_embedding_dim = None
        num_speakers = 0

        # load speakers
        if self.speakers_json != "":
            speaker_mapping = json.load(open(self.speakers_json, "r"))
            num_speakers = len(speaker_mapping)
            if C.use_external_speaker_embedding_file:
                if self.speaker_fileid is not None:
                    speaker_embedding = speaker_mapping[
                        self.speaker_fileid]["embedding"]
                else:  # if speaker_fileid is not specified, use the first sample in speakers.json
                    speaker_embedding = speaker_mapping[list(
                        speaker_mapping.keys())[0]]["embedding"]
                speaker_embedding_dim = len(speaker_embedding)

        self.speaker_embedding = speaker_embedding

        # load the model
        num_chars = len(phonemes) if C.use_phonemes else len(symbols)
        model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)
        cp = torch.load(self.model_path, map_location=torch.device("cpu"))
        model.load_state_dict(cp["model"])
        model.eval()
        if self.use_cuda:
            model.cuda()

        if hasattr(model.decoder, "set_r"):
            model.decoder.set_r(cp["r"])

        self.model = model

        # load vocoder model
        if self.vocoder_path:
            VC = load_config(self.vocoder_config_path)
            # Resolve scale_stats path
            stats_path = VC.audio.get("stats_path")
            if stats_path and not os.path.isfile(stats_path):
                # Look for stats next to config
                vocoder_stats_path = os.path.join(
                    os.path.dirname(self.vocoder_config_path),
                    "scale_stats.npy")
                if os.path.isfile(vocoder_stats_path):
                    # Patch config
                    VC.audio["stats_path"] = vocoder_stats_path
                else:
                    # Try next to TTS config
                    vocoder_stats_path = os.path.join(
                        os.path.dirname(self.config_path), "scale_stats.npy")
                    if os.path.isfile(vocoder_stats_path):
                        # Patch config
                        VC.audio["stats_path"] = vocoder_stats_path
                    else:
                        _LOGGER.warning("No vocoder scale stats found at %s",
                                        VC.audio["stats_path"])
                        VC.audio["stats_path"] = ""

            self.ap_vocoder = AudioProcessor(**VC.audio)

            vocoder_model = setup_generator(VC)
            vocoder_model.load_state_dict(
                torch.load(self.vocoder_path, map_location="cpu")["model"])
            vocoder_model.remove_weight_norm()
            vocoder_model.inference_padding = 0
            if self.use_cuda:
                vocoder_model.cuda()
            vocoder_model.eval()

            if hasattr(vocoder_model, "compute_noise_level"):
                noise_schedule_path = os.path.join(
                    os.path.dirname(self.vocoder_path), "noise_schedule.npy")
                if os.path.isfile(noise_schedule_path):
                    _LOGGER.debug("Loading noise schedule from %s",
                                  noise_schedule_path)
                    beta = np.load(noise_schedule_path,
                                   allow_pickle=True).tolist()["beta"]
                else:
                    # Fall back to a default schedule when none was computed with tune_wavegrad
                    _LOGGER.debug("Using default noise schedule")
                    beta = np.linspace(1e-6, 0.01, self.wavegrad_iters)

                vocoder_model.compute_noise_level(beta)
        else:
            vocoder_model = None
            VC = None
            self.ap_vocoder = None

        self.vocoder_model = vocoder_model
        self.vocoder_config = VC

        # synthesize voice
        self.use_griffin_lim = self.vocoder_model is None

        if not C.use_external_speaker_embedding_file:
            if self.speaker_fileid and self.speaker_fileid.isdigit():
                self.speaker_fileid = int(self.speaker_fileid)
            else:
                self.speaker_fileid = None
        else:
            self.speaker_fileid = None

        if (self.gst_style is None) and ("gst" in C.keys()):
            gst_style = C.gst.get("gst_style_input", None)
        else:
            # check if gst_style is a JSON dict string; parse it if so, otherwise use the raw string
            try:
                gst_style = json.loads(self.gst_style)
                if max(map(int,
                           gst_style.keys())) >= C.gst["gst_style_tokens"]:
                    raise RuntimeError(
                        "The highest value of the gst_style dictionary key must be less than the number of GST Tokens, \n Highest dictionary key value: {} \n Number of GST tokens: {}"
                        .format(max(map(int, gst_style.keys())),
                                C.gst["gst_style_tokens"]))
            except ValueError:
                gst_style = self.gst_style

        self.gst_style = gst_style

        # Pre-load language
        if C.get("phoneme_backend") == "gruut":
            load_gruut_language(C["phoneme_language"])

        # Compute scale factors in case TTS/vocoder sample rates differ
        # See: https://github.com/mozilla/TTS/issues/520
        self.scale_factors = None

        if self.ap_vocoder and (self.ap.sample_rate !=
                                self.ap_vocoder.sample_rate):
            self.scale_factors = (1, self.ap_vocoder.sample_rate /
                                  self.ap.sample_rate)
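            # e.g. a 22050 Hz TTS model driving a 24000 Hz vocoder gives
            # scale_factors = (1, 24000 / 22050 ≈ 1.088); this is passed to tts()
            # below as `scale_factors`, presumably to interpolate the mel frames
            # along the time axis before vocoding.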

    @property
    def sample_rate(self) -> int:
        """Get output sample rate"""
        if self.ap_vocoder:
            return self.ap_vocoder.sample_rate

        return self.ap.sample_rate

    # -------------------------------------------------------------------------

    def synthesize(self, text: str, text_is_phonemes: bool = False) -> bytes:
        """Synthesize WAV bytes from text"""
        if not self.model:
            self.load()

        wav = tts(
            self.model,
            self.vocoder_model,
            text,
            self.config,
            self.use_cuda,
            self.ap,
            self.use_griffin_lim,
            self.speaker_fileid,
            speaker_embedding=self.speaker_embedding,
            gst_style=self.gst_style,
            text_is_phonemes=text_is_phonemes,
            ap_vocoder=self.ap_vocoder,
            scale_factors=self.scale_factors,
        )

        with io.BytesIO() as wav_io:
            if self.ap_vocoder:
                # Use vocoder sample rate
                self.ap_vocoder.save_wav(wav, wav_io)
            else:
                # Use original sample rate
                self.ap.save_wav(wav, wav_io)

            return wav_io.getvalue()
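

# Usage sketch (added for illustration; the file paths below are assumptions,
# not part of the original example). With no vocoder checkpoint, Griffin-Lim
# is used for waveform generation.
if __name__ == "__main__":
    synth = Synthesizer(
        config_path="model/config.json",        # hypothetical TTS config
        model_path="model/best_model.pth.tar",  # hypothetical TTS checkpoint
        use_cuda=False,
    )
    wav_bytes = synth.synthesize("Hello world. This is a test sentence.")
    with open("output.wav", "wb") as out_file:
        out_file.write(wav_bytes)
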
Example #12
def main(**kwargs):
    global symbols, phonemes # pylint: disable=global-statement
    current_date = date.today()
    current_date = current_date.strftime("%B %d %Y")
    start_time = time.time()

    # read passed variables from gui
    text = kwargs['text']                           # text to generate speech from
    use_cuda = kwargs['use_cuda']                   # if gpu exists default is true
    project = kwargs['project']                     # path to project folder
    vocoder_type = kwargs['vocoder']                # vocoder type, default is GL
    use_gst = kwargs['use_gst']                     # use style_wave for prosody
    style_dict = kwargs['style_input']              # use style_wave for prosody
    speaker_id = kwargs['speaker_id']               # name of the selected speaker
    sentence_file = kwargs['sentence_file']         # path to file if generate from file
    out_path = kwargs['out_path']                   # path to save the output wav

    batched_vocoder = True

    # load speakers
    speakers_file_path = Path(project, "speakers.json")
    if speakers_file_path.is_file():
        speaker_data = json.load(open(speakers_file_path, 'r'))
        num_speakers = len(speaker_data)
        # get the speaker name for the selected speaker id
        if speaker_id >= num_speakers:
            print('Speaker ID outside of number of speakers range. Using default 0.')
            speaker_id = 0
        speaker_name = [speaker for speaker, id in speaker_data.items() if speaker_id == id][0]
    else:
        speaker_name = 'Default'
        num_speakers = 0
        speaker_id = None

    # load the config
    config_path = Path(project, "config.json")
    C = load_config(config_path)

    if use_gst and style_dict is not None:
        style_input = style_dict
    else:
        style_input = None

    # load the audio processor
    ap = AudioProcessor(**C.audio)

    # if the vocabulary was passed, replace the default
    if 'characters' in C.keys():
        symbols, phonemes = make_symbols(**C.characters)
        

    # find the tts model file in project folder
    try:
        tts_model_file = glob(str(Path(project, '*.pth.tar')))
        if not tts_model_file:
            raise FileNotFoundError
        model_path = tts_model_file[0]
    except FileNotFoundError:
        print('[!] TTS Model not found in path: "{}"'.format(project))
        sys.exit(1)

    # load the model
    num_chars = len(phonemes) if C.use_phonemes else len(symbols)
    model = setup_model(num_chars, num_speakers, C)

    # if gpu is not available use cpu
    model, state = load_checkpoint(model, model_path, use_cuda=use_cuda)

    model.decoder.max_decoder_steps = 2000

    model.eval()
    print(' > Model step:', state['step'])
    print(' > Model r: ', state['r'])

    # load vocoder
    if vocoder_type == 'MelGAN':
        try:
            model_file = glob(str(Path(project, 'vocoder/*.pth.tar')))
            vocoder, ap_vocoder = load_vocoder(str(Path('TTS')),
                                               str(model_file[0]),
                                               str(Path(project, 'vocoder/config.json')),
                                               use_cuda)
        except Exception:
            print('[!] Error loading vocoder: "{}"'.format(project))
            sys.exit(0)

    elif vocoder_type == 'WaveRNN':
        try:
            model_file = glob(str(Path(project, 'vocoder/*.pkl')))
            vocoder, ap_vocoder = load_vocoder(str(Path('TTS')), str(model_file[0]), str(Path(project, 'config.yml')), use_cuda)
        except Exception:
            print('[!] Error loading vocoder: "{}"'.format(project))
            sys.exit(0)
    else:
        vocoder, ap_vocoder = None, None

    print(" > Vocoder: {}".format(vocoder_type))
    print(' > Using style input: {}\n'.format(style_input))

    if sentence_file != '':
        with open(sentence_file, "r", encoding='utf8') as f:
            list_of_sentences = [s.strip() for s in f.readlines()]
    else:
        list_of_sentences = [text.strip()]

    # iterate over every passed sentence and synthesize
    for _, tts_sentence in enumerate(list_of_sentences):
        wav_list = []
        # remove characters that are not alphanumeric or in ',. '
        tts_sentence = clean_sentence(tts_sentence)
        print(" > Text: {}".format(tts_sentence))
        # build filename
        current_time = datetime.now().strftime("%H%M%S")
        file_name = ' '.join(tts_sentence.split(" ")[:10])
        # if multiple sentences in one line -> split them
        tts_sentence = split_into_sentences(tts_sentence)
        
        # if sentence was split in sub-sentences -> iterate over them
        for sentence in tts_sentence:
            # synthesize voice
            _, _, _, wav = tts(model,
                               vocoder,
                               C,
                               None,
                               sentence,
                               ap,
                               ap_vocoder,
                               use_cuda,
                               batched_vocoder,
                               speaker_id=speaker_id,
                               style_input=style_input,
                               figures=False)

            # join sub-sentences back together and add a filler between them
            wav_list += list(wav)
            wav_list += [0] * 10000

        wav = np.array(wav_list)

        # finalize filename
        file_name = "_".join([str(current_time), file_name])
        file_name = file_name.translate(
            str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'

        if out_path == "":
            out_dir = str(Path(project, 'output', current_date, speaker_name))
            out_path = os.path.join(out_dir, file_name)
        else:
            out_dir = os.path.dirname(out_path)

        # create output directory if it doesn't exist
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir, exist_ok=True)

        # save generated wav to disk
        ap.save_wav(wav, out_path)
        end_time = time.time()
        print(" > Run-time: {}".format(end_time - start_time))
        print(" > Saving output to {}\n".format(out_path))
Example #13
class Synthesizer(object):
    def __init__(
        self,
        tts_checkpoint: str,
        tts_config_path: str,
        tts_speakers_file: str = "",
        tts_languages_file: str = "",
        vocoder_checkpoint: str = "",
        vocoder_config: str = "",
        encoder_checkpoint: str = "",
        encoder_config: str = "",
        use_cuda: bool = False,
    ) -> None:
        """General 🐸 TTS interface for inference. It takes a tts and a vocoder
        model and synthesize speech from the provided text.

        The text is divided into a list of sentences using `pysbd` and synthesize
        speech on each sentence separately.

        If you have certain special characters in your text, you need to handle
        them before providing the text to Synthesizer.

        TODO: set the segmenter based on the source language

        Args:
            tts_checkpoint (str): path to the tts model file.
            tts_config_path (str): path to the tts config file.
            vocoder_checkpoint (str, optional): path to the vocoder model file. Defaults to None.
            vocoder_config (str, optional): path to the vocoder config file. Defaults to None.
            encoder_checkpoint (str, optional): path to the speaker encoder model file. Defaults to `""`,
            encoder_config (str, optional): path to the speaker encoder config file. Defaults to `""`,
            use_cuda (bool, optional): enable/disable cuda. Defaults to False.
        """
        self.tts_checkpoint = tts_checkpoint
        self.tts_config_path = tts_config_path
        self.tts_speakers_file = tts_speakers_file
        self.tts_languages_file = tts_languages_file
        self.vocoder_checkpoint = vocoder_checkpoint
        self.vocoder_config = vocoder_config
        self.encoder_checkpoint = encoder_checkpoint
        self.encoder_config = encoder_config
        self.use_cuda = use_cuda

        self.tts_model = None
        self.vocoder_model = None
        self.speaker_manager = None
        self.num_speakers = 0
        self.tts_speakers = {}
        self.language_manager = None
        self.num_languages = 0
        self.tts_languages = {}
        self.d_vector_dim = 0
        self.seg = self._get_segmenter("en")
        self.use_cuda = use_cuda

        if self.use_cuda:
            assert torch.cuda.is_available(
            ), "CUDA is not availabe on this machine."
        self._load_tts(tts_checkpoint, tts_config_path, use_cuda)
        self.output_sample_rate = self.tts_config.audio["sample_rate"]
        if vocoder_checkpoint:
            self._load_vocoder(vocoder_checkpoint, vocoder_config, use_cuda)
            self.output_sample_rate = self.vocoder_config.audio["sample_rate"]
        else:
            print(" > Using Griffin-Lim as no vocoder model defined")

    @staticmethod
    def _get_segmenter(lang: str):
        """get the sentence segmenter for the given language.

        Args:
            lang (str): target language code.

        Returns:
            pysbd.Segmenter: sentence segmenter for the given language.
        """
        return pysbd.Segmenter(language=lang, clean=True)

    def _load_tts(self, tts_checkpoint: str, tts_config_path: str,
                  use_cuda: bool) -> None:
        """Load the TTS model.

        1. Load the model config.
        2. Init the AudioProcessor.
        3. Init the model from the config.
        4. Move the model to the GPU if CUDA is enabled.
        5. Init the speaker manager for the model.

        Args:
            tts_checkpoint (str): path to the model checkpoint.
            tts_config_path (str): path to the model config file.
            use_cuda (bool): enable/disable CUDA use.
        """
        # pylint: disable=global-statement

        self.tts_config = load_config(tts_config_path)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(verbose=False, **self.tts_config.audio)

        speaker_manager = self._init_speaker_manager()
        language_manager = self._init_language_manager()
        if not self.encoder_checkpoint:
            self._set_speaker_encoder_paths_from_tts_config()
        speaker_manager = self._init_speaker_encoder(speaker_manager)

        if language_manager is not None:
            self.tts_model = setup_tts_model(
                config=self.tts_config,
                speaker_manager=speaker_manager,
                language_manager=language_manager,
            )
        else:
            self.tts_model = setup_tts_model(config=self.tts_config,
                                             speaker_manager=speaker_manager)
        self.tts_model.load_checkpoint(self.tts_config,
                                       tts_checkpoint,
                                       eval=True)
        if use_cuda:
            self.tts_model.cuda()

    def _set_speaker_encoder_paths_from_tts_config(self):
        """Set the encoder paths from the tts model config for models with speaker encoders."""
        if hasattr(self.tts_config, "model_args") and hasattr(
                self.tts_config.model_args, "speaker_encoder_config_path"):
            self.encoder_checkpoint = self.tts_config.model_args.speaker_encoder_model_path
            self.encoder_config = self.tts_config.model_args.speaker_encoder_config_path

    def _is_use_speaker_embedding(self):
        """Check if the speaker embedding is used in the model"""
        # handle the case where some models use model_args and some don't
        use_speaker_embedding = False
        if hasattr(self.tts_config, "model_args"):
            use_speaker_embedding = self.tts_config["model_args"].get(
                "use_speaker_embedding", False)
        use_speaker_embedding = use_speaker_embedding or self.tts_config.get(
            "use_speaker_embedding", False)
        return use_speaker_embedding

    def _is_use_d_vector_file(self):
        """Check if the d-vector file is used in the model"""
        # handle the case where some models use model_args and some don't
        use_d_vector_file = False
        if hasattr(self.tts_config, "model_args"):
            config = self.tts_config.model_args
            use_d_vector_file = config.get("use_d_vector_file", False)
        config = self.tts_config
        use_d_vector_file = use_d_vector_file or config.get(
            "use_d_vector_file", False)
        return use_d_vector_file

    def _init_speaker_manager(self):
        """Initialize the SpeakerManager"""
        # setup if multi-speaker settings are in the global model config
        speaker_manager = None
        speakers_file = get_from_config_or_model_args_with_default(
            self.tts_config, "speakers_file", None)
        if self._is_use_speaker_embedding():
            if self.tts_speakers_file:
                speaker_manager = SpeakerManager(
                    speaker_id_file_path=self.tts_speakers_file)
            elif speakers_file:
                speaker_manager = SpeakerManager(
                    speaker_id_file_path=speakers_file)

        if self._is_use_d_vector_file():
            d_vector_file = get_from_config_or_model_args_with_default(
                self.tts_config, "d_vector_file", None)
            if self.tts_speakers_file:
                speaker_manager = SpeakerManager(
                    d_vectors_file_path=self.tts_speakers_file)
            elif d_vector_file:
                speaker_manager = SpeakerManager(
                    d_vectors_file_path=d_vector_file)
        return speaker_manager

    def _init_speaker_encoder(self, speaker_manager):
        """Initialize the SpeakerEncoder"""
        if self.encoder_checkpoint:
            if speaker_manager is None:
                speaker_manager = SpeakerManager(
                    encoder_model_path=self.encoder_checkpoint,
                    encoder_config_path=self.encoder_config)
            else:
                speaker_manager.init_speaker_encoder(self.encoder_checkpoint,
                                                     self.encoder_config)
        return speaker_manager

    def _init_language_manager(self):
        """Initialize the LanguageManager"""
        # setup if multi-lingual settings are in the global model config
        language_manager = None
        if check_config_and_model_args(self.tts_config,
                                       "use_language_embedding", True):
            if self.tts_languages_file:
                language_manager = LanguageManager(
                    language_ids_file_path=self.tts_languages_file)
            elif self.tts_config.get("language_ids_file", None):
                language_manager = LanguageManager(
                    language_ids_file_path=self.tts_config.language_ids_file)
            else:
                language_manager = LanguageManager(config=self.tts_config)
        return language_manager

    def _load_vocoder(self, model_file: str, model_config: str,
                      use_cuda: bool) -> None:
        """Load the vocoder model.

        1. Load the vocoder config.
        2. Init the AudioProcessor for the vocoder.
        3. Init the vocoder model from the config.
        4. Move the model to the GPU if CUDA is enabled.

        Args:
            model_file (str): path to the model checkpoint.
            model_config (str): path to the model config file.
            use_cuda (bool): enable/disable CUDA use.
        """
        self.vocoder_config = load_config(model_config)
        self.vocoder_ap = AudioProcessor(verbose=False,
                                         **self.vocoder_config.audio)
        self.vocoder_model = setup_vocoder_model(self.vocoder_config)
        self.vocoder_model.load_checkpoint(self.vocoder_config,
                                           model_file,
                                           eval=True)
        if use_cuda:
            self.vocoder_model.cuda()

    def split_into_sentences(self, text) -> List[str]:
        """Split give text into sentences.

        Args:
            text (str): input text in string format.

        Returns:
            List[str]: list of sentences.
        """
        return self.seg.segment(text)

    def save_wav(self, wav: List[int], path: str) -> None:
        """Save the waveform as a file.

        Args:
            wav (List[int]): waveform as a list of values.
            path (str): output path to save the waveform.
        """
        wav = np.array(wav)
        self.ap.save_wav(wav, path, self.output_sample_rate)

    def tts(
        self,
        text: str,
        speaker_name: str = "",
        language_name: str = "",
        speaker_wav: Union[str, List[str]] = None,
        style_wav=None,
    ) -> List[int]:
        """🐸 TTS magic. Run all the models and generate speech.

        Args:
            text (str): input text.
            speaker_name (str, optional): speaker id for multi-speaker models. Defaults to "".
            language_name (str, optional): language id for multi-language models. Defaults to "".
            speaker_wav (Union[str, List[str]], optional): path to the speaker wav. Defaults to None.
            style_wav ([type], optional): style waveform for GST. Defaults to None.

        Returns:
            List[int]: the generated waveform as a list of sample values.
        """
        start_time = time.time()
        wavs = []
        sens = self.split_into_sentences(text)
        print(" > Text splitted to sentences.")
        print(sens)

        # handle multi-speaker
        speaker_embedding = None
        speaker_id = None
        if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager,
                                             "speaker_ids"):
            if speaker_name and isinstance(speaker_name, str):
                if self.tts_config.use_d_vector_file:
                    # get the speaker embedding from the saved d_vectors.
                    speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(
                        speaker_name)[0]
                    speaker_embedding = np.array(speaker_embedding)[
                        None, :]  # [1 x embedding_dim]
                else:
                    # get speaker idx from the speaker name
                    speaker_id = self.tts_model.speaker_manager.speaker_ids[
                        speaker_name]

            elif not speaker_name and not speaker_wav:
                raise ValueError(
                    " [!] Look like you use a multi-speaker model. "
                    "You need to define either a `speaker_name` or a `style_wav` to use a multi-speaker model."
                )
            else:
                speaker_embedding = None
        else:
            if speaker_name:
                raise ValueError(
                    f" [!] Missing speakers.json file path for selecting speaker {speaker_name}."
                    "Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. "
                )

        # handle multi-lingual
        language_id = None
        if self.tts_languages_file or (
                hasattr(self.tts_model, "language_manager")
                and self.tts_model.language_manager is not None):
            if language_name and isinstance(language_name, str):
                language_id = self.tts_model.language_manager.language_id_mapping[
                    language_name]

            elif not language_name:
                raise ValueError(
                    " [!] Look like you use a multi-lingual model. "
                    "You need to define either a `language_name` or a `style_wav` to use a multi-lingual model."
                )

            else:
                raise ValueError(
                    f" [!] Missing language_ids.json file path for selecting language {language_name}."
                    "Define path for language_ids.json if it is a multi-lingual model or remove defined language idx. "
                )

        # compute a new d_vector from the given clip.
        if speaker_wav is not None:
            speaker_embedding = self.tts_model.speaker_manager.compute_d_vector_from_clip(
                speaker_wav)

        use_gl = self.vocoder_model is None

        for sen in sens:
            # synthesize voice
            outputs = synthesis(
                model=self.tts_model,
                text=sen,
                CONFIG=self.tts_config,
                use_cuda=self.use_cuda,
                ap=self.ap,
                speaker_id=speaker_id,
                language_id=language_id,
                language_name=language_name,
                style_wav=style_wav,
                enable_eos_bos_chars=self.tts_config.enable_eos_bos_chars,
                use_griffin_lim=use_gl,
                d_vector=speaker_embedding,
            )
            waveform = outputs["wav"]
            mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach(
            ).cpu().numpy()
            if not use_gl:
                # denormalize tts output based on tts audio config
                mel_postnet_spec = self.ap.denormalize(mel_postnet_spec.T).T
                device_type = "cuda" if self.use_cuda else "cpu"
                # renormalize spectrogram based on vocoder config
                vocoder_input = self.vocoder_ap.normalize(mel_postnet_spec.T)
                # compute scale factor for possible sample rate mismatch
                scale_factor = [
                    1,
                    self.vocoder_config["audio"]["sample_rate"] /
                    self.ap.sample_rate,
                ]
                if scale_factor[1] != 1:
                    print(" > interpolating tts model output.")
                    vocoder_input = interpolate_vocoder_input(
                        scale_factor, vocoder_input)
                else:
                    vocoder_input = torch.tensor(vocoder_input).unsqueeze(0)  # pylint: disable=not-callable
                # run vocoder model
                # [1, T, C]
                waveform = self.vocoder_model.inference(
                    vocoder_input.to(device_type))
            if self.use_cuda and not use_gl:
                waveform = waveform.cpu()
            if not use_gl:
                waveform = waveform.numpy()
            waveform = waveform.squeeze()

            # trim silence
            if self.tts_config.audio["do_trim_silence"] is True:
                waveform = trim_silence(waveform, self.ap)

            wavs += list(waveform)
            wavs += [0] * 10000

        # compute stats
        process_time = time.time() - start_time
        audio_time = len(wavs) / self.tts_config.audio["sample_rate"]
        print(f" > Processing time: {process_time}")
        print(f" > Real-time factor: {process_time / audio_time}")
        return wavs
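

# Usage sketch (added for illustration; the paths below are assumptions, not part
# of the original example). Without a vocoder checkpoint, Griffin-Lim is used and
# the waveform is saved at the TTS model's sample rate.
if __name__ == "__main__":
    synth = Synthesizer(
        tts_checkpoint="model/best_model.pth",  # hypothetical checkpoint
        tts_config_path="model/config.json",    # hypothetical config
        use_cuda=False,
    )
    wav = synth.tts("Hello world. This is a test sentence.")
    synth.save_wav(wav, "output.wav")
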
Example #14
class Synthesizer(object):
    def __init__(self, config):
        self.wavernn = None
        self.pwgan = None
        self.config = config
        self.use_cuda = self.config.use_cuda
        if self.use_cuda:
            assert torch.cuda.is_available(
            ), "CUDA is not availabe on this machine."
        self.load_tts(self.config.tts_checkpoint, self.config.tts_config,
                      self.config.use_cuda)
        if self.config.wavernn_lib_path:
            self.load_wavernn(self.config.wavernn_lib_path,
                              self.config.wavernn_file,
                              self.config.wavernn_config, self.config.use_cuda)
        if self.config.pwgan_file:
            self.load_pwgan(self.config.pwgan_file, self.config.pwgan_config,
                            self.config.use_cuda)

    def load_tts(self, tts_checkpoint, tts_config, use_cuda):
        print(" > Loading TTS model ...")
        print(" | > model config: ", tts_config)
        print(" | > checkpoint file: ", tts_checkpoint)
        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)
        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
            self.input_size = len(symbols)
        # TODO: fix this for multi-speaker model - load speakers
        if self.config.tts_speakers is not None:
            self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
            num_speakers = len(self.tts_speakers)
        else:
            num_speakers = 0
        self.tts_model = setup_model(self.input_size,
                                     num_speakers=num_speakers,
                                     c=self.tts_config)
        # load model state
        cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
        # load the model
        self.tts_model.load_state_dict(cp['model'])
        if use_cuda:
            self.tts_model.cuda()
        self.tts_model.eval()
        self.tts_model.decoder.max_decoder_steps = 3000
        if 'r' in cp:
            self.tts_model.decoder.set_r(cp['r'])

    def load_wavernn(self, lib_path, model_file, model_config, use_cuda):
        # TODO: set a function in wavernn code base for model setup and call it here.
        sys.path.append(
            lib_path)  # set this if WaveRNN is not installed globally
        #pylint: disable=import-outside-toplevel
        from WaveRNN.models.wavernn import Model
        print(" > Loading WaveRNN model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", model_file)
        self.wavernn_config = load_config(model_config)
        # This is the default architecture we use for our models.
        # You might need to update it
        self.wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=self.wavernn_config.mode,
            mulaw=self.wavernn_config.mulaw,
            pad=self.wavernn_config.pad,
            use_aux_net=self.wavernn_config.use_aux_net,
            use_upsample_net=self.wavernn_config.use_upsample_net,
            upsample_factors=self.wavernn_config.upsample_factors,
            feat_dims=80,
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=self.ap.hop_length,
            sample_rate=self.ap.sample_rate,
        )

        check = torch.load(model_file, map_location="cpu")
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()

    def load_pwgan(self, model_file, model_config, use_cuda):
        #pylint: disable=import-outside-toplevel
        from parallel_wavegan.models import ParallelWaveGANGenerator
        from parallel_wavegan.utils.audio import AudioProcessor as AudioProcessorVocoder
        print(" > Loading PWGAN model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", model_file)
        with open(model_config) as f:
            self.pwgan_config = yaml.load(f, Loader=yaml.Loader)
        self.pwgan = ParallelWaveGANGenerator(
            **self.pwgan_config["generator_params"])
        self.pwgan.load_state_dict(
            torch.load(model_file, map_location="cpu")["model"]["generator"])
        self.pwgan.remove_weight_norm()
        self.pwgan_ap = AudioProcessorVocoder(**self.pwgan_config["audio"])
        if use_cuda:
            self.pwgan.cuda()
        self.pwgan.eval()

    def save_wav(self, wav, path):
        # wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    def split_into_sentences(self, text):
        text = " " + text + " <stop>"
        text = text.replace("\n", " ")
        text = re.sub(prefixes, "\\1<prd>", text)
        text = re.sub(websites, "<prd>\\1", text)
        if "Ph.D" in text:
            text = text.replace("Ph.D.", "Ph<prd>D<prd>")
        text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
        text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
        text = re.sub(
            alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
            "\\1<prd>\\2<prd>\\3<prd>", text)
        text = re.sub(alphabets + "[.]" + alphabets + "[.]",
                      "\\1<prd>\\2<prd>", text)
        text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2",
                      text)
        text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
        text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
        if "”" in text:
            text = text.replace(".”", "”.")
        if "\"" in text:
            text = text.replace(".\"", "\".")
        if "!" in text:
            text = text.replace("!\"", "\"!")
        if "?" in text:
            text = text.replace("?\"", "\"?")
        text = text.replace(".", ".<stop>")
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        text = text.replace("<prd>", ".")
        sentences = text.split("<stop>")
        sentences = sentences[:-1]
        sentences = list(filter(None, [s.strip() for s in sentences]))
        return sentences

    def tts(self, text):
        wavs = []
        sens = self.split_into_sentences(text)
        print(sens)
        if not sens:
            sens = [text + '.']
        for sen in sens:
            # preprocess the given text
            inputs = text_to_seqvec(sen, self.tts_config, self.use_cuda)
            # synthesize voice
            decoder_output, postnet_output, alignments, _ = run_model(
                self.tts_model, inputs, self.tts_config, False, None, None)
            # convert outputs to numpy
            postnet_output, decoder_output, _ = parse_outputs(
                postnet_output, decoder_output, alignments)

            if self.pwgan:
                input_tensor = torch.FloatTensor(postnet_output.T).unsqueeze(0)
                if self.use_cuda:
                    input_tensor = input_tensor.cuda()
                wav = self.pwgan.inference(
                    input_tensor,
                    hop_size=self.pwgan_ap.hop_length).data.cpu().numpy()
            else:
                wav = inv_spectrogram(postnet_output, self.ap, self.tts_config)
            # trim silence
            wav = trim_silence(wav, self.ap)

            wavs += list(wav)
            wavs += [0] * 10000

        out = io.BytesIO()
        self.save_wav(wavs, out)
        return out
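

# Usage sketch (added for illustration; the paths and config fields below are
# assumptions based on the attributes referenced in __init__, not part of the
# original example). With neither WaveRNN nor PWGAN configured, Griffin-Lim is used.
from types import SimpleNamespace

if __name__ == "__main__":
    config = SimpleNamespace(
        use_cuda=False,
        tts_checkpoint="model/best_model.pth.tar",  # hypothetical checkpoint
        tts_config="model/config.json",             # hypothetical config
        tts_speakers=None,
        wavernn_lib_path=None,  # leave None to skip the WaveRNN vocoder
        pwgan_file=None,        # leave None to skip the ParallelWaveGAN vocoder
    )
    synth = Synthesizer(config)
    wav_io = synth.tts("Hello world. This is a test sentence.")
    with open("output.wav", "wb") as out_file:
        out_file.write(wav_io.getvalue())
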
Example #15
class Synthesizer(object):
    def __init__(
        self,
        tts_checkpoint: str,
        tts_config_path: str,
        tts_speakers_file: str = "",
        vocoder_checkpoint: str = "",
        vocoder_config: str = "",
        encoder_checkpoint: str = "",
        encoder_config: str = "",
        use_cuda: bool = False,
    ) -> None:
        """General 🐸 TTS interface for inference. It takes a tts and a vocoder
        model and synthesize speech from the provided text.

        The text is divided into a list of sentences using `pysbd` and synthesize
        speech on each sentence separately.

        If you have certain special characters in your text, you need to handle
        them before providing the text to Synthesizer.

        TODO: set the segmenter based on the source language

        Args:
            tts_checkpoint (str): path to the tts model file.
            tts_config_path (str): path to the tts config file.
            vocoder_checkpoint (str, optional): path to the vocoder model file. Defaults to None.
            vocoder_config (str, optional): path to the vocoder config file. Defaults to None.
            encoder_checkpoint (str, optional): path to the speaker encoder model file. Defaults to `""`,
            encoder_config (str, optional): path to the speaker encoder config file. Defaults to `""`,
            use_cuda (bool, optional): enable/disable cuda. Defaults to False.
        """
        self.tts_checkpoint = tts_checkpoint
        self.tts_config_path = tts_config_path
        self.tts_speakers_file = tts_speakers_file
        self.vocoder_checkpoint = vocoder_checkpoint
        self.vocoder_config = vocoder_config
        self.encoder_checkpoint = encoder_checkpoint
        self.encoder_config = encoder_config
        self.use_cuda = use_cuda

        self.tts_model = None
        self.vocoder_model = None
        self.speaker_manager = None
        self.num_speakers = 0
        self.tts_speakers = {}
        self.d_vector_dim = 0
        self.seg = self._get_segmenter("en")
        self.use_cuda = use_cuda

        if self.use_cuda:
            assert torch.cuda.is_available(
            ), "CUDA is not availabe on this machine."
        self._load_tts(tts_checkpoint, tts_config_path, use_cuda)
        self.output_sample_rate = self.tts_config.audio["sample_rate"]
        if vocoder_checkpoint:
            self._load_vocoder(vocoder_checkpoint, vocoder_config, use_cuda)
            self.output_sample_rate = self.vocoder_config.audio["sample_rate"]

    @staticmethod
    def _get_segmenter(lang: str):
        """get the sentence segmenter for the given language.

        Args:
            lang (str): target language code.

        Returns:
            pysbd.Segmenter: sentence segmenter for the given language.
        """
        return pysbd.Segmenter(language=lang, clean=True)

    def _load_speakers(self, speaker_file: str) -> None:
        """Load the SpeakerManager to organize multi-speaker TTS. It loads the speakers meta-data and the speaker
        encoder if it is defined.

        Args:
            speaker_file (str): path to the speakers meta-data file.
        """
        print("Loading speakers ...")
        self.speaker_manager = SpeakerManager(
            encoder_model_path=self.encoder_checkpoint,
            encoder_config_path=self.encoder_config,
        )
        self.speaker_manager.load_d_vectors_file(
            self.tts_config.get("d_vector_file", speaker_file))
        self.num_speakers = self.speaker_manager.num_speakers
        self.d_vector_dim = self.speaker_manager.d_vector_dim

    def _set_tts_speaker_file(self):
        """Set the TTS speaker file used by a multi-speaker model."""
        # setup if multi-speaker settings are in the global model config
        if (hasattr(self.tts_config, "use_speaker_embedding")
                and self.tts_config.use_speaker_embedding is True):
            if self.tts_config.use_d_vector_file:
                self.tts_speakers_file = (self.tts_speakers_file
                                          if self.tts_speakers_file else
                                          self.tts_config["d_vector_file"])
                self.tts_config["d_vector_file"] = self.tts_speakers_file
            else:
                self.tts_speakers_file = (self.tts_speakers_file
                                          if self.tts_speakers_file else
                                          self.tts_config["speakers_file"])

        # setup if multi-speaker settings are in the model args config
        if (self.tts_speakers_file is None
                and hasattr(self.tts_config, "model_args") and hasattr(
                    self.tts_config.model_args, "use_speaker_embedding")
                and self.tts_config.model_args.use_speaker_embedding):
            _args = self.tts_config.model_args
            if _args.use_d_vector_file:
                self.tts_speakers_file = (self.tts_speakers_file
                                          if self.tts_speakers_file else
                                          _args["d_vector_file"])
                _args["d_vector_file"] = self.tts_speakers_file
            else:
                self.tts_speakers_file = (self.tts_speakers_file
                                          if self.tts_speakers_file else
                                          _args["speakers_file"])

    def _load_tts(self, tts_checkpoint: str, tts_config_path: str,
                  use_cuda: bool) -> None:
        """Load the TTS model.

        Args:
            tts_checkpoint (str): path to the model checkpoint.
            tts_config_path (str): path to the model config file.
            use_cuda (bool): enable/disable CUDA use.
        """
        # pylint: disable=global-statement

        self.tts_config = load_config(tts_config_path)

        # Patch stats_path
        stats_path = self.tts_config["audio"].get("stats_path", "")
        if stats_path and (not os.path.isfile(stats_path)):
            stats_path = os.path.join(os.path.dirname(tts_checkpoint),
                                      os.path.split(stats_path)[1])
            self.tts_config["audio"]["stats_path"] = stats_path

        # Patch speakers file
        speakers_file = self.tts_config.get("model_args",
                                            {}).get("speakers_file", "")
        if speakers_file and (not os.path.isfile(speakers_file)):
            speakers_file = os.path.join(os.path.dirname(tts_checkpoint),
                                         os.path.split(speakers_file)[1])
            self.tts_config["model_args"]["speakers_file"] = speakers_file

        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(verbose=False, **self.tts_config.audio)

        self.tts_model = setup_tts_model(config=self.tts_config)
        self.tts_model.load_checkpoint(self.tts_config,
                                       tts_checkpoint,
                                       eval=True)
        if use_cuda:
            self.tts_model.cuda()
        self._set_tts_speaker_file()

    def _load_vocoder(self, model_file: str, model_config: str,
                      use_cuda: bool) -> None:
        """Load the vocoder model.

        Args:
            model_file (str): path to the model checkpoint.
            model_config (str): path to the model config file.
            use_cuda (bool): enable/disable CUDA use.
        """
        self.vocoder_config = load_config(model_config)

        # Patch stats_path
        stats_path = self.vocoder_config["audio"].get("stats_path", "")
        if stats_path and (not os.path.isfile(stats_path)):
            stats_path = os.path.join(os.path.dirname(model_file),
                                      os.path.split(stats_path)[1])
            self.vocoder_config["audio"]["stats_path"] = stats_path

        self.vocoder_ap = AudioProcessor(verbose=False,
                                         **self.vocoder_config.audio)
        self.vocoder_model = setup_vocoder_model(self.vocoder_config)
        self.vocoder_model.load_checkpoint(self.vocoder_config,
                                           model_file,
                                           eval=True)
        if use_cuda:
            self.vocoder_model.cuda()

    def split_into_sentences(self, text) -> List[str]:
        """Split give text into sentences.

        Args:
            text (str): input text in string format.

        Returns:
            List[str]: list of sentences.
        """
        return self.seg.segment(text)

    def save_wav(self, wav: List[int], path: str) -> None:
        """Save the waveform as a file.

        Args:
            wav (List[int]): waveform as a list of values.
            path (str): output path to save the waveform.
        """
        wav = np.array(wav)
        self.ap.save_wav(wav, path, self.output_sample_rate)

    def tts(self,
            text: str,
            speaker_idx: str = "",
            speaker_wav=None,
            style_wav=None) -> List[int]:
        """🐸 TTS magic. Run all the models and generate speech.

        Args:
            text (str): input text.
            speaker_idx (str, optional): speaker id for multi-speaker models. Defaults to "".
            speaker_wav (str, optional): path to a reference wav used to compute a speaker
                d-vector for voice cloning. Defaults to None.
            style_wav (optional): style waveform (or GST style definition) for GST models.
                Defaults to None.

        Returns:
            List[int]: generated waveform as a list of sample values.
        """
        start_time = time.time()
        wavs = []
        sens = self.split_into_sentences(text)
        print(" > Text splitted to sentences.")
        print(sens)

        # handle multi-speaker
        speaker_embedding = None
        speaker_id = None
        if isinstance(speaker_idx, int):
            speaker_id = speaker_idx
        elif self.tts_speakers_file:
            if speaker_idx and isinstance(speaker_idx, str):
                if self.tts_config.use_d_vector_file:
                    # get the speaker embedding from the saved d_vectors.
                    speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(
                        speaker_idx)[0]
                else:
                    # get speaker idx from the speaker name
                    try:
                        speaker_id = self.tts_model.speaker_manager.speaker_ids[
                            speaker_idx]
                    except KeyError:
                        # Interpret the given speaker_idx as an integer id
                        speaker_id = int(speaker_idx)

            elif not speaker_idx and not speaker_wav:
                raise ValueError(
                    " [!] Look like you use a multi-speaker model. "
                    "You need to define either a `speaker_idx` or a `style_wav` to use a multi-speaker model."
                )
            else:
                speaker_embedding = None
        else:
            if speaker_idx:
                raise ValueError(
                    f" [!] Missing speaker.json file path for selecting speaker {speaker_idx}."
                    "Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. "
                )

        # compute a new d_vector from the given clip.
        if speaker_wav is not None:
            speaker_embedding = self.tts_model.speaker_manager.compute_d_vector_from_clip(
                speaker_wav)

        use_gl = self.vocoder_model is None

        for sen in sens:
            # synthesize voice
            outputs = synthesis(
                model=self.tts_model,
                text=sen,
                CONFIG=self.tts_config,
                use_cuda=self.use_cuda,
                ap=self.ap,
                speaker_id=speaker_id,
                style_wav=style_wav,
                enable_eos_bos_chars=self.tts_config.enable_eos_bos_chars,
                use_griffin_lim=use_gl,
                d_vector=speaker_embedding,
            )
            waveform = outputs["wav"]
            mel_postnet_spec = (
                outputs["outputs"]["model_outputs"][0].detach().cpu().numpy())
            if not use_gl:
                # denormalize tts output based on tts audio config
                mel_postnet_spec = self.ap.denormalize(mel_postnet_spec.T).T
                device_type = "cuda" if self.use_cuda else "cpu"
                # renormalize spectrogram based on vocoder config
                vocoder_input = self.vocoder_ap.normalize(mel_postnet_spec.T)
                # compute scale factor for possible sample rate mismatch
                scale_factor = [
                    1,
                    self.vocoder_config["audio"]["sample_rate"] /
                    self.ap.sample_rate,
                ]
                if scale_factor[1] != 1:
                    print(" > interpolating tts model output.")
                    vocoder_input = interpolate_vocoder_input(
                        scale_factor, vocoder_input)
                else:
                    vocoder_input = torch.tensor(vocoder_input).unsqueeze(0)  # pylint: disable=not-callable
                # run vocoder model
                # [1, T, C]
                waveform = self.vocoder_model.inference(
                    vocoder_input.to(device_type))
            if self.use_cuda and not use_gl:
                waveform = waveform.cpu()
            if not use_gl:
                waveform = waveform.numpy()
            waveform = waveform.squeeze()

            # trim silence
            waveform = trim_silence(waveform, self.ap)

            wavs += list(waveform)
            wavs += [0] * 10000

        # compute stats
        process_time = time.time() - start_time
        audio_time = len(wavs) / self.tts_config.audio["sample_rate"]
        print(f" > Processing time: {process_time}")
        print(f" > Real-time factor: {process_time / audio_time}")
        return wavs
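The `tts()` and `save_wav()` methods above cover the whole inference path. A minimal usage sketch, assuming the surrounding class is Coqui TTS's `Synthesizer` and that the checkpoint/config paths below are placeholders for locally available files:

from TTS.utils.synthesizer import Synthesizer  # assumed import path

synthesizer = Synthesizer(
    tts_checkpoint="tts_model/best_model.pth.tar",  # placeholder paths
    tts_config_path="tts_model/config.json",
    use_cuda=False,
)
wav = synthesizer.tts("Hello world. This is only a test.")
synthesizer.save_wav(wav, "output.wav")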
Example #16
class tts_class:
    def __init__(self):

        # Set constants
        ROOT_PATH = 'TTS/tts_model'  # no trailing slash; the paths below prepend '/' themselves
        MODEL_PATH = ROOT_PATH + '/best_model.pth.tar'
        # MODEL_PATH_TMP = ROOT_PATH + '/best_model.pth.tar'
        CONFIG_PATH = ROOT_PATH + '/config.json'
        OUT_FOLDER = ROOT_PATH + '/test'
        self.CONFIG = load_config(CONFIG_PATH)
        self.use_cuda = True  # set to False to run inference on CPU

        # load the model
        self.model = Tacotron(self.CONFIG.embedding_size, self.CONFIG.num_freq,
                              self.CONFIG.num_mels, self.CONFIG.r)

        # load the audio processor

        self.ap = AudioProcessor(self.CONFIG.sample_rate, self.CONFIG.num_mels,
                                 self.CONFIG.min_level_db,
                                 self.CONFIG.frame_shift_ms,
                                 self.CONFIG.frame_length_ms,
                                 self.CONFIG.ref_level_db,
                                 self.CONFIG.num_freq, self.CONFIG.power,
                                 self.CONFIG.preemphasis, 60)

        # load model state
        if self.use_cuda:
            cp = torch.load(MODEL_PATH)
        else:
            cp = torch.load(MODEL_PATH,
                            map_location=lambda storage, loc: storage)

        # load the model
        self.model.load_state_dict(cp['model'])
        if self.use_cuda:
            self.model.cuda()
        self.model.eval()

        self.model.decoder.max_decoder_steps = 500

        self.nlp = spacy.load("en")  # spaCy English pipeline ("en_core_web_sm" on newer spaCy), used for sentence splitting

    def process(self, text):
        self.model.decoder.max_decoder_steps = 500
        wavefiles = self.text2audio(text, self.model, self.CONFIG,
                                    self.use_cuda, self.ap)
        return wavefiles

    def tts(self, model, text, CONFIG, use_cuda, ap, wavefile, figures=True):
        waveform, alignment, spectrogram, stop_tokens = create_speech(
            model, text, CONFIG, use_cuda, ap)

        self.ap.save_wav(waveform, wavefile)

    def text2audio(self, text, model, CONFIG, use_cuda, ap):
        wavefiles = []
        base_name = "gen_{}.wav"

        doc = self.nlp(text)
        for i, sent in enumerate(doc.sents):
            text = sent.text.strip()
            wavefile = base_name.format(i)
            self.tts(model, text, CONFIG, use_cuda, ap, wavefile)
            wavefiles.append(wavefile)

        return wavefiles

    def play(self, wavefiles):

        voice = AudioSegment.empty()

        for wavefile in wavefiles:
            voice += AudioSegment.from_wav(wavefile)

        play(voice)

        for w in wavefiles:
            os.remove(w)
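A minimal driver for `tts_class`, assuming the model files under `TTS/tts_model/` exist and that `spacy` (with its English model) and `pydub` are installed:

engine = tts_class()                                  # loads Tacotron and the audio processor
wavefiles = engine.process("Hello there. This is only a test.")  # one wav file per sentence
engine.play(wavefiles)                                # concatenate, play, then delete the files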
Example #17
    else:
        # if gst_style is a JSON dictionary string, parse it; otherwise keep it as a plain string
        try:
            gst_style = json.loads(args.gst_style)
            if max(map(int, gst_style.keys())) >= C.gst['gst_style_tokens']:
                raise RuntimeError(
                    "The largest gst_style dictionary key must be less than the "
                    "number of GST tokens.\n Largest key: {}\n Number of GST tokens: {}"
                    .format(max(map(int, gst_style.keys())),
                            C.gst['gst_style_tokens']))
        except ValueError:
            gst_style = args.gst_style

    wav = tts(model,
              vocoder_model,
              args.text,
              C,
              args.use_cuda,
              ap,
              use_griffin_lim,
              args.speaker_fileid,
              speaker_embedding=speaker_embedding,
              gst_style=gst_style)

    # save the results
    file_name = args.text.replace(" ", "_")[0:10]
    file_name = file_name.translate(
        str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
    out_path = os.path.join(args.out_path, file_name)
    print(" > Saving output to {}".format(out_path))
    ap.save_wav(wav, out_path)
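For context, `args.gst_style` can be either a plain string (typically the path to a style reference wav) or a JSON dictionary keyed by GST token index, which the block above parses. A hedged example of the dictionary form (token indices and weights are illustrative):

import json

# Keys are GST token indices and must stay below C.gst['gst_style_tokens'];
# values are the weights applied to the corresponding style tokens.
gst_style = json.loads('{"0": 0.15, "1": -0.1, "4": 0.25}')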
Example #18
class MozillaTTS:
    """
        Wrapper for Mozilla TTS

        Related repositories:
            - Mozilla TTS:
                - https://github.com/mozilla/TTS
                - commit 824c091
                - data: https://drive.google.com/drive/folders/1FJRjGDAqWIyZRX4CsppaIPEW8UWXCWzF?usp=drive_open
            - WaveRNN(optional):
                - https://github.com/erogol/WaveRNN
                - commit 8a1c152
                - data: https://drive.google.com/drive/folders/1wpPn3a0KQc6EYtKL0qOi4NqEmhML71Ve

    """
    def __init__(self, tts_model, tts_config, wavernn_model=None, wavernn_config=None, device="cpu"):
        from TTS.utils.generic_utils import load_config
        self.tts_config = load_config(tts_config)
        self.tts_config.windowing = True
        if not torch.cuda.is_available():
            device = "cpu"
        self.use_cuda = device != "cpu"
        self.device = torch.device(device)
        self.tts_model_path = tts_model

        self._load_tts()

        if wavernn_model and wavernn_config:
            self.use_gl = False
            self.batched_wavernn = True
            self.wavernn_model_path = wavernn_model
            self.wavernn_config = load_config(wavernn_config)
            self._load_wavernn()
        else:
            self.use_gl = True

    def _load_tts(self):
        # LOAD TTS MODEL
        from TTS.utils.text.symbols import symbols, phonemes
        from TTS.utils.audio import AudioProcessor
        from TTS.utils.generic_utils import setup_model

        # load the model
        num_chars = len(phonemes) if self.tts_config.use_phonemes else len(symbols)
        self.tts_model = setup_model(num_chars, self.tts_config)

        # load the audio processor
        self._ap = AudioProcessor(**self.tts_config.audio)         

        # load model state
        cp = torch.load(self.tts_model_path, map_location=lambda storage, loc: storage)

        # load the model
        self.tts_model.load_state_dict(cp['model'])
        self.tts_model.to(self.device)
        self.tts_model.eval()
        self.tts_model.decoder.max_decoder_steps = 2000

    def _load_wavernn(self):
        from WaveRNN.models.wavernn import Model

        self.wavernn = Model(
                rnn_dims=512,
                fc_dims=512,
                mode="mold",
                pad=2,
                upsample_factors=self.wavernn_config.upsample_factors,  # set this depending on dataset
                feat_dims=self.wavernn_config.audio["num_mels"],
                compute_dims=128,
                res_out_dims=128,
                res_blocks=10,
                hop_length=self._ap.hop_length,
                sample_rate=self._ap.sample_rate,
            ).to(self.device)

        check = torch.load(self.wavernn_model_path)
        self.wavernn.load_state_dict(check['model'])
        self.wavernn.eval()

    def __call__(self, text, out_path):
        waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(
            self.tts_model, text, self.tts_config, self.use_cuda, self._ap,
            False, self.tts_config.enable_eos_bos_chars)
        if not self.use_gl:
            waveform = self.wavernn.generate(
                torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).to(self.device),
                batched=self.batched_wavernn, target=11000, overlap=550)

        self._ap.save_wav(waveform, out_path)
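A minimal usage sketch for the `MozillaTTS` wrapper, assuming the checkpoint and config paths below are placeholders for locally available files:

mozilla_tts = MozillaTTS(
    tts_model="tts_model/best_model.pth.tar",  # placeholder paths
    tts_config="tts_model/config.json",
    device="cuda",                             # silently falls back to CPU if CUDA is unavailable
)
mozilla_tts("Hello world.", "hello.wav")       # uses Griffin-Lim when no WaveRNN model is given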