示例#1
0
    def __init__(self, config: GlowTTSConfig):
        """Create the encoder and decoder sub-modules from ``config``.

        Every key of ``config`` is first copied onto the instance as an
        attribute, so the hyper-parameters used below are read from ``self``.

        Args:
            config: model configuration. It is iterated for its keys, indexed
                by key, and read for ``out_channels`` and
                ``data_dep_init_steps``.
        """
        super().__init__()

        # Mirror all config fields onto `self` so the rest of the code can
        # keep using plain attribute access.
        self.config = config
        for field_name in config:
            setattr(self, field_name, config[field_name])

        # `get_characters` yields a triple: the first item is dropped, the
        # second replaces `self.config`, the third becomes `self.num_chars`.
        _, self.config, self.num_chars = self.get_characters(config)
        self.decoder_output_dim = config.out_channels

        self.init_multispeaker(config)

        # Conditioning channel count. It is reset to 0 here, which overwrites
        # any `c_in_channels` attribute mirrored from the config above.
        # With more than one speaker it becomes `d_vector_dim` when that is
        # truthy, otherwise the fixed fallback of 256.
        self.c_in_channels = 0
        if self.num_speakers > 1:
            # TODO: make the 256 fallback adjustable
            self.c_in_channels = self.d_vector_dim if self.d_vector_dim else 256

        self.run_data_dep_init = config.data_dep_init_steps > 0

        encoder_options = dict(
            out_channels=self.out_channels,
            hidden_channels=self.hidden_channels_enc,
            hidden_channels_dp=self.hidden_channels_dp,
            encoder_type=self.encoder_type,
            encoder_params=self.encoder_params,
            mean_only=self.mean_only,
            use_prenet=self.use_encoder_prenet,
            dropout_p_dp=self.dropout_p_dp,
            c_in_channels=self.c_in_channels,
        )
        self.encoder = Encoder(self.num_chars, **encoder_options)

        # The decoder takes its first six arguments positionally; keep the
        # order exactly as the Decoder signature expects it.
        decoder_positional = (
            self.out_channels,
            self.hidden_channels_dec,
            self.kernel_size_dec,
            self.dilation_rate,
            self.num_flow_blocks_dec,
            self.num_block_layers,
        )
        decoder_options = dict(
            dropout_p=self.dropout_p_dec,
            num_splits=self.num_splits,
            num_squeeze=self.num_squeeze,
            sigmoid_scale=self.sigmoid_scale,
            c_in_channels=self.c_in_channels,
        )
        self.decoder = Decoder(*decoder_positional, **decoder_options)
示例#2
0
文件: glow_tts.py 项目: coqui-ai/TTS
    def __init__(
        self,
        config: GlowTTSConfig,
        ap: "AudioProcessor" = None,
        tokenizer: "TTSTokenizer" = None,
        speaker_manager: SpeakerManager = None,
    ):
        """Create the encoder and decoder sub-modules from ``config``.

        Args:
            config: model configuration. It is iterated for its keys, indexed
                by key, and read for ``out_channels`` and
                ``data_dep_init_steps``.
            ap: forwarded unchanged to the parent constructor.
            tokenizer: forwarded unchanged to the parent constructor.
            speaker_manager: forwarded unchanged to the parent constructor.
        """
        super().__init__(config, ap, tokenizer, speaker_manager)

        # Mirror all config fields onto `self` so the rest of the code can
        # keep using plain attribute access.
        self.config = config
        for field_name in config:
            setattr(self, field_name, config[field_name])

        self.decoder_output_dim = config.out_channels

        # Set up the multi-speaker layers if they are needed.
        self.init_multispeaker(config)

        self.run_data_dep_init = config.data_dep_init_steps > 0

        # NOTE(review): `self.num_chars` and `self.c_in_channels` are not
        # assigned directly in this method; presumably they come from the
        # parent constructor, the config mirror loop or `init_multispeaker`
        # - confirm.
        encoder_options = dict(
            out_channels=self.out_channels,
            hidden_channels=self.hidden_channels_enc,
            hidden_channels_dp=self.hidden_channels_dp,
            encoder_type=self.encoder_type,
            encoder_params=self.encoder_params,
            mean_only=self.mean_only,
            use_prenet=self.use_encoder_prenet,
            dropout_p_dp=self.dropout_p_dp,
            c_in_channels=self.c_in_channels,
        )
        self.encoder = Encoder(self.num_chars, **encoder_options)

        # The decoder takes its first six arguments positionally; keep the
        # order exactly as the Decoder signature expects it.
        decoder_positional = (
            self.out_channels,
            self.hidden_channels_dec,
            self.kernel_size_dec,
            self.dilation_rate,
            self.num_flow_blocks_dec,
            self.num_block_layers,
        )
        decoder_options = dict(
            dropout_p=self.dropout_p_dec,
            num_splits=self.num_splits,
            num_squeeze=self.num_squeeze,
            sigmoid_scale=self.sigmoid_scale,
            c_in_channels=self.c_in_channels,
        )
        self.decoder = Decoder(*decoder_positional, **decoder_options)
示例#3
0
    def __init__(
        self,
        num_chars,
        hidden_channels_enc,
        hidden_channels_dec,
        use_encoder_prenet,
        hidden_channels_dp,
        out_channels,
        num_flow_blocks_dec=12,
        inference_noise_scale=0.33,
        kernel_size_dec=5,
        dilation_rate=5,
        num_block_layers=4,
        dropout_p_dp=0.1,
        dropout_p_dec=0.05,
        num_speakers=0,
        c_in_channels=0,
        num_splits=4,
        num_squeeze=1,
        sigmoid_scale=False,
        mean_only=False,
        encoder_type="transformer",
        encoder_params=None,
        speaker_embedding_dim=None,
    ):
        """Store the hyper-parameters and build encoder, decoder and speaker table.

        Args:
            num_chars: first positional argument of ``Encoder``; stored.
            hidden_channels_enc: ``hidden_channels`` of ``Encoder``; stored.
            hidden_channels_dec: second positional argument of ``Decoder``; stored.
            use_encoder_prenet: ``use_prenet`` of ``Encoder``; stored.
            hidden_channels_dp: forwarded to ``Encoder``; stored.
            out_channels: forwarded to ``Encoder`` and ``Decoder``; stored.
            num_flow_blocks_dec: forwarded positionally to ``Decoder``; stored.
            inference_noise_scale: stored only.
            kernel_size_dec: forwarded positionally to ``Decoder``; stored.
            dilation_rate: forwarded positionally to ``Decoder``; stored.
            num_block_layers: forwarded positionally to ``Decoder``; stored.
            dropout_p_dp: forwarded to ``Encoder``; not stored.
            dropout_p_dec: ``dropout_p`` of ``Decoder``; stored.
            num_speakers: values above 1 enable the multi-speaker handling.
            c_in_channels: conditioning channel count; may be overridden when
                ``num_speakers > 1`` (see the body).
            num_splits: forwarded to ``Decoder``; stored.
            num_squeeze: forwarded to ``Decoder``; stored.
            sigmoid_scale: forwarded to ``Decoder``; stored.
            mean_only: forwarded to ``Encoder``; stored.
            encoder_type: forwarded to ``Encoder``; not stored.
            encoder_params: forwarded to ``Encoder``; not stored.
            speaker_embedding_dim: when truthy and ``num_speakers > 1`` it
                becomes the conditioning channel count and no embedding table
                is created.
        """
        super().__init__()

        # Values consumed by the encoder.
        self.num_chars = num_chars
        self.hidden_channels_enc = hidden_channels_enc
        self.hidden_channels_dp = hidden_channels_dp
        self.use_encoder_prenet = use_encoder_prenet
        self.mean_only = mean_only

        # Values consumed by the decoder.
        self.hidden_channels_dec = hidden_channels_dec
        self.num_flow_blocks_dec = num_flow_blocks_dec
        self.kernel_size_dec = kernel_size_dec
        self.dilation_rate = dilation_rate
        self.num_block_layers = num_block_layers
        self.dropout_p_dec = dropout_p_dec
        self.num_splits = num_splits
        self.num_squeeze = num_squeeze
        self.sigmoid_scale = sigmoid_scale

        # Values shared by both or only kept on the instance.
        self.out_channels = out_channels
        self.num_speakers = num_speakers
        self.c_in_channels = c_in_channels
        self.speaker_embedding_dim = speaker_embedding_dim
        self.inference_noise_scale = inference_noise_scale

        # Model constants.
        # noise_scale: noise variance applied to the random z vector at inference.
        self.noise_scale = 0.33
        # length_scale: scaler for the duration predictor; larger means slower speech.
        self.length_scale = 1.0

        # Multi-speaker conditioning width: a given speaker embedding dim
        # always wins; otherwise an unset (0) width falls back to 256.
        if num_speakers > 1:
            if speaker_embedding_dim:
                self.c_in_channels = speaker_embedding_dim
            elif c_in_channels == 0:
                # TODO: make this adjustable
                self.c_in_channels = 256

        encoder_options = dict(
            out_channels=out_channels,
            hidden_channels=hidden_channels_enc,
            hidden_channels_dp=hidden_channels_dp,
            encoder_type=encoder_type,
            encoder_params=encoder_params,
            mean_only=mean_only,
            use_prenet=use_encoder_prenet,
            dropout_p_dp=dropout_p_dp,
            c_in_channels=self.c_in_channels,
        )
        self.encoder = Encoder(num_chars, **encoder_options)

        decoder_positional = (
            out_channels,
            hidden_channels_dec,
            kernel_size_dec,
            dilation_rate,
            num_flow_blocks_dec,
            num_block_layers,
        )
        decoder_options = dict(
            dropout_p=dropout_p_dec,
            num_splits=num_splits,
            num_squeeze=num_squeeze,
            sigmoid_scale=sigmoid_scale,
            c_in_channels=self.c_in_channels,
        )
        self.decoder = Decoder(*decoder_positional, **decoder_options)

        # A learned speaker embedding table is only created when no
        # speaker embedding dimension was supplied.
        needs_speaker_table = num_speakers > 1 and not speaker_embedding_dim
        if needs_speaker_table:
            speaker_table = nn.Embedding(num_speakers, self.c_in_channels)
            self.emb_g = speaker_table
            nn.init.uniform_(speaker_table.weight, -0.1, 0.1)
示例#4
0
    def __init__(
        self,
        num_chars,
        hidden_channels,
        filter_channels,
        filter_channels_dp,
        out_channels,
        kernel_size=3,
        num_heads=2,
        num_layers_enc=6,
        dropout_p=0.1,
        num_flow_blocks_dec=12,
        kernel_size_dec=5,
        dilation_rate=5,
        num_block_layers=4,
        dropout_p_dec=0.0,
        num_speakers=0,
        c_in_channels=0,
        num_splits=4,
        num_sqz=1,
        sigmoid_scale=False,
        rel_attn_window_size=None,
        input_length=None,
        mean_only=False,
        hidden_channels_enc=None,
        hidden_channels_dec=None,
        use_encoder_prenet=False,
        encoder_type="transformer",
    ):
        """Store the hyper-parameters and build encoder, decoder and speaker table.

        Args:
            num_chars: first positional argument of ``Encoder``; stored.
            hidden_channels: forwarded to ``Encoder``; also the decoder width
                when ``hidden_channels_dec`` is falsy.
            filter_channels: forwarded to ``Encoder``; stored.
            filter_channels_dp: forwarded to ``Encoder``; stored.
            out_channels: forwarded to ``Encoder`` and ``Decoder``; stored.
            kernel_size: forwarded to ``Encoder``; stored.
            num_heads: forwarded to ``Encoder``; stored.
            num_layers_enc: ``num_layers`` of ``Encoder``; stored.
            dropout_p: forwarded to ``Encoder``; stored.
            num_flow_blocks_dec: forwarded positionally to ``Decoder``; stored.
            kernel_size_dec: forwarded positionally to ``Decoder``; stored.
            dilation_rate: forwarded positionally to ``Decoder``; stored.
            num_block_layers: forwarded positionally to ``Decoder``; stored.
            dropout_p_dec: ``dropout_p`` of ``Decoder``; stored.
            num_speakers: values above 1 create the ``emb_g`` embedding table.
            c_in_channels: forwarded to ``Encoder`` and ``Decoder`` and used as
                the embedding width of ``emb_g``; stored.
            num_splits: forwarded to ``Decoder``; stored.
            num_sqz: forwarded to ``Decoder``; stored.
            sigmoid_scale: forwarded to ``Decoder``; stored.
            rel_attn_window_size: stored only.
            input_length: stored only.
            mean_only: forwarded to ``Encoder``; stored.
            hidden_channels_enc: stored only (see the note in the body).
            hidden_channels_dec: decoder width; falls back to
                ``hidden_channels`` when falsy. Stored as given.
            use_encoder_prenet: ``use_prenet`` of ``Encoder``; stored.
            encoder_type: forwarded to ``Encoder``; not stored.
        """
        super().__init__()

        # Values consumed by the encoder.
        self.num_chars = num_chars
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.filter_channels_dp = filter_channels_dp
        self.kernel_size = kernel_size
        self.num_heads = num_heads
        self.num_layers_enc = num_layers_enc
        self.dropout_p = dropout_p
        self.mean_only = mean_only
        self.use_encoder_prenet = use_encoder_prenet

        # Values consumed by the decoder.
        self.num_flow_blocks_dec = num_flow_blocks_dec
        self.kernel_size_dec = kernel_size_dec
        self.dilation_rate = dilation_rate
        self.num_block_layers = num_block_layers
        self.dropout_p_dec = dropout_p_dec
        self.num_splits = num_splits
        self.num_sqz = num_sqz
        self.sigmoid_scale = sigmoid_scale
        self.hidden_channels_dec = hidden_channels_dec

        # Values shared by both or only kept on the instance.
        self.out_channels = out_channels
        self.num_speakers = num_speakers
        self.c_in_channels = c_in_channels
        self.rel_attn_window_size = rel_attn_window_size
        self.input_length = input_length
        # NOTE(review): `hidden_channels_enc` is stored here but the Encoder
        # below is built with `hidden_channels`; confirm this is intended.
        self.hidden_channels_enc = hidden_channels_enc

        # Fixed defaults that are not constructor arguments.
        self.noise_scale = 0.66
        self.length_scale = 1.0

        encoder_options = dict(
            out_channels=out_channels,
            hidden_channels=hidden_channels,
            filter_channels=filter_channels,
            filter_channels_dp=filter_channels_dp,
            encoder_type=encoder_type,
            num_heads=num_heads,
            num_layers=num_layers_enc,
            kernel_size=kernel_size,
            dropout_p=dropout_p,
            mean_only=mean_only,
            use_prenet=use_encoder_prenet,
            c_in_channels=c_in_channels,
        )
        self.encoder = Encoder(num_chars, **encoder_options)

        # The decoder uses its own hidden width when one is given, else the
        # shared `hidden_channels`.
        decoder_width = hidden_channels_dec or hidden_channels
        decoder_positional = (
            out_channels,
            decoder_width,
            kernel_size_dec,
            dilation_rate,
            num_flow_blocks_dec,
            num_block_layers,
        )
        decoder_options = dict(
            dropout_p=dropout_p_dec,
            num_splits=num_splits,
            num_sqz=num_sqz,
            sigmoid_scale=sigmoid_scale,
            c_in_channels=c_in_channels,
        )
        self.decoder = Decoder(*decoder_positional, **decoder_options)

        # Multi-speaker models get a speaker embedding table of width
        # `c_in_channels`.
        if num_speakers > 1:
            speaker_table = nn.Embedding(num_speakers, c_in_channels)
            self.emb_g = speaker_table
            nn.init.uniform_(speaker_table.weight, -0.1, 0.1)