Example #1
    def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None):

        super().__init__(config)

        self.speaker_manager = speaker_manager
        self.init_multispeaker(config)

        self.max_duration = self.args.max_duration
        self.use_aligner = self.args.use_aligner
        self.use_pitch = self.args.use_pitch
        self.use_binary_alignment_loss = False

        self.length_scale = (float(self.args.length_scale) if isinstance(
            self.args.length_scale, int) else self.args.length_scale)

        self.emb = nn.Embedding(self.args.num_chars, self.args.hidden_channels)

        self.encoder = Encoder(
            self.args.hidden_channels,
            self.args.hidden_channels,
            self.args.encoder_type,
            self.args.encoder_params,
            self.embedded_speaker_dim,
        )

        if self.args.positional_encoding:
            self.pos_encoder = PositionalEncoding(self.args.hidden_channels)

        self.decoder = Decoder(
            self.args.out_channels,
            self.args.hidden_channels,
            self.args.decoder_type,
            self.args.decoder_params,
        )

        self.duration_predictor = DurationPredictor(
            self.args.hidden_channels + self.embedded_speaker_dim,
            self.args.duration_predictor_hidden_channels,
            self.args.duration_predictor_kernel_size,
            self.args.duration_predictor_dropout_p,
        )

        if self.args.use_pitch:
            self.pitch_predictor = DurationPredictor(
                self.args.hidden_channels + self.embedded_speaker_dim,
                self.args.pitch_predictor_hidden_channels,
                self.args.pitch_predictor_kernel_size,
                self.args.pitch_predictor_dropout_p,
            )
            self.pitch_emb = nn.Conv1d(
                1,
                self.args.hidden_channels,
                kernel_size=self.args.pitch_embedding_kernel_size,
                padding=int((self.args.pitch_embedding_kernel_size - 1) / 2),
            )

        if self.args.use_aligner:
            self.aligner = AlignmentNetwork(
                in_query_channels=self.args.out_channels,
                in_key_channels=self.args.hidden_channels)
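The int-to-float guard on `length_scale` above keeps duration scaling in floating point when the config supplies an int; a minimal standalone illustration (the `durations` values are made up for the example):

import torch

length_scale = 1  # an int, as a config might supply
length_scale = float(length_scale) if isinstance(length_scale, int) else length_scale
durations = torch.tensor([2.0, 3.0, 1.0]) * length_scale  # values > 1.0 stretch durations, i.e. slower speech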
Example #2
import torch

# Import paths below follow the usual Coqui TTS layout and are assumptions;
# adjust them to the local tree if they differ.
from TTS.tts.layers.feed_forward.encoder import Encoder
from TTS.tts.utils.data import sequence_mask

device = "cuda" if torch.cuda.is_available() else "cpu"


def test_encoder():
    input_dummy = torch.rand(8, 14, 37).to(device)
    input_lengths = torch.randint(31, 37, (8, )).long().to(device)
    input_lengths[-1] = 37
    input_mask = torch.unsqueeze(
        sequence_mask(input_lengths, input_dummy.size(2)), 1).to(device)
    # relative positional transformer encoder
    layer = Encoder(out_channels=11,
                    in_hidden_channels=14,
                    encoder_type="relative_position_transformer",
                    encoder_params={
                        "hidden_channels_ffn": 768,
                        "num_heads": 2,
                        "kernel_size": 3,
                        "dropout_p": 0.1,
                        "num_layers": 6,
                        "rel_attn_window_size": 4,
                        "input_length": None
                    }).to(device)
    output = layer(input_dummy, input_mask)
    assert list(output.shape) == [8, 11, 37]
    # residual conv bn encoder
    layer = Encoder(out_channels=11,
                    in_hidden_channels=14,
                    encoder_type="residual_conv_bn",
                    encoder_params={
                        "kernel_size": 4,
                        "dilations": 4 * [1, 2, 4] + [1],
                        "num_conv_blocks": 2,
                        "num_res_blocks": 13
                    }).to(device)
    output = layer(input_dummy, input_mask)
    assert list(output.shape) == [8, 11, 37]
    # FFTransformer encoder
    layer = Encoder(out_channels=14,
                    in_hidden_channels=14,
                    encoder_type="fftransformer",
                    encoder_params={
                        "hidden_channels_ffn": 31,
                        "num_heads": 2,
                        "num_layers": 2,
                        "dropout_p": 0.1
                    }).to(device)
    output = layer(input_dummy, input_mask)
    assert list(output.shape) == [8, 14, 37]
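The test relies on a `sequence_mask` helper whose body is not shown. A minimal sketch of the conventional implementation (True at valid steps, False at padding), given the imports above; the real library helper may differ in details:

def sequence_mask(sequence_length, max_len=None):
    # [B] lengths -> [B, max_len] boolean mask, True at valid positions
    if max_len is None:
        max_len = int(sequence_length.max())
    seq_range = torch.arange(max_len, device=sequence_length.device)
    return seq_range.unsqueeze(0) < sequence_length.unsqueeze(1)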
Example #3
    def __init__(
        self,
        num_chars,
        out_channels,
        hidden_channels=256,
        hidden_channels_dp=256,
        encoder_type="fftransformer",
        encoder_params={
            "hidden_channels_ffn": 1024,
            "num_heads": 2,
            "num_layers": 6,
            "dropout_p": 0.1
        },
        decoder_type="fftransformer",
        decoder_params={
            "hidden_channels_ffn": 1024,
            "num_heads": 2,
            "num_layers": 6,
            "dropout_p": 0.1
        },
        length_scale=1,
        num_speakers=0,
        external_c=False,
        c_in_channels=0,
    ):

        super().__init__()
        self.length_scale = float(length_scale) if isinstance(
            length_scale, int) else length_scale
        self.emb = nn.Embedding(num_chars, hidden_channels)
        self.pos_encoder = PositionalEncoding(hidden_channels)
        self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type,
                               encoder_params, c_in_channels)
        self.decoder = Decoder(out_channels, hidden_channels, decoder_type,
                               decoder_params)
        self.duration_predictor = DurationPredictor(hidden_channels_dp)

        self.mod_layer = nn.Conv1d(hidden_channels, hidden_channels, 1)
        self.mdn_block = MDNBlock(hidden_channels, 2 * out_channels)

        if num_speakers > 1 and not external_c:
            # speaker embedding layer
            self.emb_g = nn.Embedding(num_speakers, c_in_channels)
            nn.init.uniform_(self.emb_g.weight, -0.1, 0.1)

        if c_in_channels > 0 and c_in_channels != hidden_channels:
            self.proj_g = nn.Conv1d(c_in_channels, hidden_channels, 1)
Example #4
    def __init__(
        self,
        num_chars,
        out_channels,
        hidden_channels,
        positional_encoding=True,
        length_scale=1,
        encoder_type="residual_conv_bn",
        encoder_params={
            "kernel_size": 4,
            "dilations": 4 * [1, 2, 4] + [1],
            "num_conv_blocks": 2,
            "num_res_blocks": 13
        },
        decoder_type="residual_conv_bn",
        decoder_params={
            "kernel_size": 4,
            "dilations": 4 * [1, 2, 4, 8] + [1],
            "num_conv_blocks": 2,
            "num_res_blocks": 17,
        },
        num_speakers=0,
        external_c=False,
        c_in_channels=0,
    ):

        super().__init__()
        self.length_scale = float(length_scale) if isinstance(
            length_scale, int) else length_scale
        self.emb = nn.Embedding(num_chars, hidden_channels)
        self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type,
                               encoder_params, c_in_channels)
        if positional_encoding:
            self.pos_encoder = PositionalEncoding(hidden_channels)
        self.decoder = Decoder(out_channels, hidden_channels, decoder_type,
                               decoder_params)
        self.duration_predictor = DurationPredictor(hidden_channels +
                                                    c_in_channels)

        if num_speakers > 1 and not external_c:
            # speaker embedding layer
            self.emb_g = nn.Embedding(num_speakers, c_in_channels)
            nn.init.uniform_(self.emb_g.weight, -0.1, 0.1)

        if c_in_channels > 0 and c_in_channels != hidden_channels:
            self.proj_g = nn.Conv1d(c_in_channels, hidden_channels, 1)
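A hypothetical sketch of how `emb_g` and `proj_g` are typically combined to condition the hidden states on a speaker; the method name, argument names, and shapes below are assumptions for illustration, not taken from the original:

def _cond_speaker(self, x, g=None):
    # x: hidden states [B, hidden_channels, T], g: speaker embedding [B, c_in_channels, 1]
    if g is not None:
        if hasattr(self, "proj_g"):
            g = self.proj_g(g)  # project c_in_channels -> hidden_channels when dims differ
        x = x + g  # broadcast-add across the time axis
    return x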
Example #5
    def __init__(self, config: Coqpit):

        super().__init__()
        self.config = config
        self.phase = -1
        self.length_scale = (float(config.model_args.length_scale)
                             if isinstance(config.model_args.length_scale, int)
                             else config.model_args.length_scale)

        if not self.config.model_args.num_chars:
            _, self.config, num_chars = self.get_characters(config)
            self.config.model_args.num_chars = num_chars

        self.emb = nn.Embedding(self.config.model_args.num_chars,
                                self.config.model_args.hidden_channels)

        self.embedded_speaker_dim = 0
        self.init_multispeaker(config)

        self.pos_encoder = PositionalEncoding(
            config.model_args.hidden_channels)
        self.encoder = Encoder(
            config.model_args.hidden_channels,
            config.model_args.hidden_channels,
            config.model_args.encoder_type,
            config.model_args.encoder_params,
            self.embedded_speaker_dim,
        )
        self.decoder = Decoder(
            config.model_args.out_channels,
            config.model_args.hidden_channels,
            config.model_args.decoder_type,
            config.model_args.decoder_params,
        )
        self.duration_predictor = DurationPredictor(
            config.model_args.hidden_channels_dp)

        self.mod_layer = nn.Conv1d(config.model_args.hidden_channels,
                                   config.model_args.hidden_channels, 1)

        self.mdn_block = MDNBlock(config.model_args.hidden_channels,
                                  2 * config.model_args.out_channels)

        if self.embedded_speaker_dim > 0 and self.embedded_speaker_dim != config.model_args.hidden_channels:
            self.proj_g = nn.Conv1d(self.embedded_speaker_dim,
                                    config.model_args.hidden_channels, 1)
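The `mdn_block` maps hidden states to 2 * out_channels, conventionally read as the per-frame mean and log-scale of a diagonal Gaussian used for alignment scoring; a standalone sketch of that split (the tensor below is a stand-in, not real model output):

import torch

o_mdn = torch.rand(2, 2 * 80, 37)             # stand-in for the raw 2*out_channels projection
mu, log_sigma = torch.chunk(o_mdn, 2, dim=1)  # each [2, 80, 37]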
Example #6
    def __init__(self, config: Coqpit):

        super().__init__()

        # don't use isinstance here, to avoid a recursive import
        if "Config" in config.__class__.__name__:
            if "characters" in config:
                # loading from FastPitchConfig
                _, self.config, num_chars = self.get_characters(config)
                config.model_args.num_chars = num_chars
                self.args = self.config.model_args
            else:
                # loading from ForwardTTSArgs
                self.config = config
                self.args = config.model_args
        elif isinstance(config, ForwardTTSArgs):
            self.args = config
            self.config = config
        else:
            raise ValueError(
                "config must be either a *Config or ForwardTTSArgs")

        self.max_duration = self.args.max_duration
        self.use_aligner = self.args.use_aligner
        self.use_pitch = self.args.use_pitch
        self.use_binary_alignment_loss = False

        self.length_scale = (float(self.args.length_scale) if isinstance(
            self.args.length_scale, int) else self.args.length_scale)

        self.emb = nn.Embedding(self.args.num_chars, self.args.hidden_channels)

        self.encoder = Encoder(
            self.args.hidden_channels,
            self.args.hidden_channels,
            self.args.encoder_type,
            self.args.encoder_params,
            self.args.d_vector_dim,
        )

        if self.args.positional_encoding:
            self.pos_encoder = PositionalEncoding(self.args.hidden_channels)

        self.decoder = Decoder(
            self.args.out_channels,
            self.args.hidden_channels,
            self.args.decoder_type,
            self.args.decoder_params,
        )

        self.duration_predictor = DurationPredictor(
            self.args.hidden_channels + self.args.d_vector_dim,
            self.args.duration_predictor_hidden_channels,
            self.args.duration_predictor_kernel_size,
            self.args.duration_predictor_dropout_p,
        )

        if self.args.use_pitch:
            self.pitch_predictor = DurationPredictor(
                self.args.hidden_channels + self.args.d_vector_dim,
                self.args.pitch_predictor_hidden_channels,
                self.args.pitch_predictor_kernel_size,
                self.args.pitch_predictor_dropout_p,
            )
            self.pitch_emb = nn.Conv1d(
                1,
                self.args.hidden_channels,
                kernel_size=self.args.pitch_embedding_kernel_size,
                padding=int((self.args.pitch_embedding_kernel_size - 1) / 2),
            )

        if self.args.num_speakers > 1 and not self.args.use_d_vector:
            # speaker embedding layer
            self.emb_g = nn.Embedding(self.args.num_speakers,
                                      self.args.d_vector_dim)
            nn.init.uniform_(self.emb_g.weight, -0.1, 0.1)

        if self.args.d_vector_dim > 0 and self.args.d_vector_dim != self.args.hidden_channels:
            self.proj_g = nn.Conv1d(self.args.d_vector_dim,
                                    self.args.hidden_channels, 1)

        if self.args.use_aligner:
            self.aligner = AlignmentNetwork(
                in_query_channels=self.args.out_channels,
                in_key_channels=self.args.hidden_channels)
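The `padding=(kernel_size - 1) / 2` used for `pitch_emb` preserves the time dimension for odd kernel sizes at stride 1; a quick self-contained check:

import torch
import torch.nn as nn

k = 3  # any odd pitch_embedding_kernel_size behaves the same
conv = nn.Conv1d(1, 8, kernel_size=k, padding=(k - 1) // 2)
print(conv(torch.rand(2, 1, 37)).shape)  # torch.Size([2, 8, 37]) -- length preserved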