class SpeakerEncoderConfig(BaseTrainingConfig):
    """Defines parameters for Speaker Encoder model."""

    model: str = "speaker_encoder"
    audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
    datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
    # model params
    model_params: Dict = field(
        default_factory=lambda: {
            "model_name": "lstm",
            "input_dim": 80,
            "proj_dim": 256,
            "lstm_dim": 768,
            "num_lstm_layers": 3,
            "use_lstm_with_projection": True,
        }
    )

    audio_augmentation: Dict = field(default_factory=lambda: {})

    storage: Dict = field(
        default_factory=lambda: {
            "sample_from_storage_p": 0.66,  # probability of sampling a batch from the in-memory storage instead of the dataset
            "storage_size": 15,  # size of the in-memory storage, in batches
        }
    )

    # training params
    max_train_step: int = 1000000  # end training when the number of training steps reaches this value.
    loss: str = "angleproto"
    grad_clip: float = 3.0
    lr: float = 0.0001
    lr_decay: bool = False
    warmup_steps: int = 4000
    wd: float = 1e-6

    # logging params
    tb_model_param_stats: bool = False
    steps_plot_stats: int = 10
    checkpoint: bool = True
    save_step: int = 1000
    print_step: int = 20

    # data loader
    num_speakers_in_batch: int = MISSING
    num_utters_per_speaker: int = MISSING
    num_loader_workers: int = MISSING
    skip_speakers: bool = False
    voice_len: float = 1.6

    def check_values(self):
        super().check_values()
        c = asdict(self)
        assert (
            c["model_params"]["input_dim"] == self.audio.num_mels
        ), " [!] model input dimendion must be equal to melspectrogram dimension."
Example #2
class BaseEncoderConfig(BaseTrainingConfig):
    """Defines parameters for a Generic Encoder model."""

    model: str = None
    audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
    datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
    # model params
    model_params: Dict = field(
        default_factory=lambda: {
            "model_name": "lstm",
            "input_dim": 80,
            "proj_dim": 256,
            "lstm_dim": 768,
            "num_lstm_layers": 3,
            "use_lstm_with_projection": True,
        }
    )

    audio_augmentation: Dict = field(default_factory=lambda: {})

    # training params
    epochs: int = 10000
    loss: str = "angleproto"
    grad_clip: float = 3.0
    lr: float = 0.0001
    optimizer: str = "radam"
    optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0})
    lr_decay: bool = False
    warmup_steps: int = 4000

    # logging params
    tb_model_param_stats: bool = False
    steps_plot_stats: int = 10
    save_step: int = 1000
    print_step: int = 20
    run_eval: bool = False

    # data loader
    num_classes_in_batch: int = MISSING
    num_utter_per_class: int = MISSING
    eval_num_classes_in_batch: int = None
    eval_num_utter_per_class: int = None

    num_loader_workers: int = MISSING
    voice_len: float = 1.6

    def check_values(self):
        super().check_values()
        c = asdict(self)
        assert (
            c["model_params"]["input_dim"] == self.audio.num_mels
        ), " [!] model input dimendion must be equal to melspectrogram dimension."
Example #3
import os
import unittest

import torch

from tests import get_tests_output_path, run_cli
from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig

torch.manual_seed(1)

config_path = os.path.join(get_tests_output_path(), "test_model_config.json")

dataset_config_en = BaseDatasetConfig(
    name="ljspeech",
    meta_file_train="metadata.csv",
    meta_file_val="metadata.csv",
    path="tests/data/ljspeech",
    language="en",
)

dataset_config_pt = BaseDatasetConfig(
    name="ljspeech",
    meta_file_train="metadata.csv",
    meta_file_val="metadata.csv",
    path="tests/data/ljspeech",
    language="pt-br",
)
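
# Hedged sketch (hypothetical helper, not part of the original test): the two
# dataset configs above would typically be merged into one multilingual
# VitsConfig and saved, so a script such as TTS/bin/find_unique_phonemes.py
# can read it via run_cli; the exact CLI flags are assumptions.
def _build_multilingual_config():
    config = VitsConfig(datasets=[dataset_config_en, dataset_config_pt])
    config.save_json(config_path)
    run_cli(f'python TTS/bin/find_unique_phonemes.py --config_path "{config_path}"')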


# pylint: disable=protected-access
class TestFindUniquePhonemes(unittest.TestCase):
    ...
Example #4
import os

from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig
from TTS.trainer import Trainer, TrainingArgs
from TTS.tts.configs.fast_pitch_config import FastPitchConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.forward_tts import ForwardTTS
from TTS.utils.audio import AudioProcessor
from TTS.utils.manage import ModelManager

output_path = os.path.dirname(os.path.abspath(__file__))

# init configs
dataset_config = BaseDatasetConfig(
    name="ljspeech",
    meta_file_train="metadata.csv",
    # meta_file_attn_mask=os.path.join(output_path, "../LJSpeech-1.1/metadata_attn_mask.txt"),
    path=os.path.join(output_path, "../LJSpeech-1.1/"),
)

audio_config = BaseAudioConfig(
    sample_rate=22050,
    do_trim_silence=True,
    trim_db=60.0,
    signal_norm=False,
    mel_fmin=0.0,
    mel_fmax=8000,
    spec_gain=1.0,
    log_func="np.log",
    ref_level_db=20,
    preemphasis=0.0,
)
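
# Plausible continuation of this recipe (a hedged sketch, not the original
# file): wire the configs into a FastPitchConfig and start training. Exact
# Trainer/AudioProcessor signatures vary across TTS versions, so treat the
# calls below as assumptions matching the imports above.
config = FastPitchConfig(
    audio=audio_config,
    datasets=[dataset_config],
    run_name="fast_pitch_ljspeech",
    output_path=output_path,
)

# AudioProcessor handles feature extraction; ModelManager (imported above) is
# used by the full recipe to fetch a model for attention-mask computation.
ap = AudioProcessor(**config.audio.to_dict())

# load training and eval samples as described by the dataset config
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)

model = ForwardTTS(config)

trainer = Trainer(
    TrainingArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
    training_assets={"audio_processor": ap},  # assumed kwarg in this TTS version
)
trainer.fit()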