Example #1
    def __init__(self,
                 device,
                 sample_size,
                 padding_index,
                 ntokens=5,
                 embed_dim=512,
                 noise_std=0.1,
                 dropout=0.1):
        """
        number of latent-space tokens is constant.
        """
        super().__init__(None)
        # self.device = device
        self.dropout = dropout
        self.dim = embed_dim
        self.ntokens = ntokens

        self.content_embeddings = Embedding(
            sample_size, embed_dim * ntokens,
            padding_index)  # tokens-encoder, sample-specific
        # self.content_embeddings = LargeEmbedding(sample_size, embed_dim * ntokens, page_size=1024, num_devices=1, use_cuda=True) # tokens-encoder, sample-specific

        self.negative_embedding = PositionalEmbedding(
            num_embeddings=ntokens + 1,
            embedding_dim=embed_dim,
            padding_idx=0)

        self.positive_embedding = PositionalEmbedding(
            num_embeddings=ntokens + 1,
            embedding_dim=embed_dim,
            padding_idx=0)

        self.noise = Normal(loc=0.0, scale=noise_std)
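
These snippets assume fairseq-style Embedding and PositionalEmbedding helpers defined elsewhere. A minimal sketch of the Embedding wrapper, assuming the usual fairseq initialization (scaled-normal weights, zeroed padding row); the upstream version may differ:

import torch.nn as nn

def Embedding(num_embeddings, embedding_dim, padding_idx):
    """Sketch of the assumed fairseq-style helper: an nn.Embedding with
    normal-initialized weights and the padding row zeroed out."""
    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
    if padding_idx is not None:
        nn.init.constant_(m.weight[padding_idx], 0)
    return m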
Example #2
    def __init__(self,
                 n_vocab,
                 embed_dim,
                 n_speakers,
                 speaker_embed_dim,
                 padding_idx=None,
                 convolutions=((64, 5, 1), ) * 7,
                 max_positions=512,
                 dropout=0.1):
        super(Encoder, self).__init__()
        self.dropout = dropout
        self.num_attention_layers = None

        # Text input embeddings
        self.embed_tokens = Embedding(n_vocab, embed_dim, padding_idx)

        # Text position embedding
        self.embed_text_positions = Embedding(max_positions, embed_dim,
                                              padding_idx)
        self.embed_text_positions.weight.data = position_encoding_init(
            max_positions, embed_dim)

        # Speaker embedding
        if n_speakers > 1:
            self.speaker_fc1 = Linear(speaker_embed_dim, embed_dim)
            self.speaker_fc2 = Linear(speaker_embed_dim, embed_dim)
        self.n_speakers = n_speakers

        # Non-causal convolutions
        in_channels = convolutions[0][0]
        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
        self.projections = nn.ModuleList()
        self.speaker_projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()

        Conv1dLayer = Conv1d if has_dilation(convolutions) else ConvTBC

        for (out_channels, kernel_size, dilation) in convolutions:
            pad = (kernel_size - 1) // 2 * dilation
            dilation = (dilation, )
            self.projections.append(
                Linear(in_channels, out_channels)
                if in_channels != out_channels else None)
            self.speaker_projections.append(
                Linear(speaker_embed_dim, out_channels)
                if n_speakers > 1 else None)
            self.convolutions.append(
                Conv1dLayer(in_channels,
                            out_channels * 2,
                            kernel_size,
                            padding=pad,
                            dilation=dilation,
                            dropout=dropout))
            in_channels = out_channels
        self.fc2 = Linear(in_channels, embed_dim)
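
Example #2 calls two helpers that are not shown, position_encoding_init and has_dilation. Plausible sketches, assuming the common DeepVoice3-style sinusoid table; the upstream versions may differ in detail:

import numpy as np
import torch

def position_encoding_init(n_position, d_pos_vec, position_rate=1.0):
    """Sinusoid position-encoding table; position_rate stretches the
    positions, as used for the query/key position embeddings."""
    position_enc = np.array([
        [position_rate * pos / np.power(10000, 2 * (i // 2) / d_pos_vec)
         for i in range(d_pos_vec)]
        if pos != 0 else np.zeros(d_pos_vec)
        for pos in range(n_position)])
    position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])  # even dims
    position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])  # odd dims
    return torch.from_numpy(position_enc).float()

def has_dilation(convolutions):
    """True if any (out_channels, kernel_size, dilation) spec dilates."""
    return any(dilation > 1 for _, _, dilation in convolutions)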
Example #3
    def __init__(self,
                 encoder,
                 decoder,
                 converter,
                 mel_dim=80,
                 linear_dim=4096,
                 n_speakers=1,
                 speaker_embed_dim=16,
                 padding_idx=None,
                 trainable_positional_encodings=False):
        super(DeepVoice3, self).__init__()
        self.mel_dim = mel_dim
        self.linear_dim = linear_dim
        self.trainable_positional_encodings = trainable_positional_encodings

        self.encoder = encoder
        self.decoder = decoder
        self.converter = converter
        self.encoder.num_attention_layers = sum(
            layer is not None for layer in decoder.attention)

        # Speaker embedding
        if n_speakers > 1:
            self.embed_speakers = Embedding(n_speakers, speaker_embed_dim,
                                            padding_idx)
        self.n_speakers = n_speakers
        self.speaker_embed_dim = speaker_embed_dim

        self.use_text_pos_embedding_in_encoder = False
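
A minimal wiring sketch using the Encoder and Decoder from the neighboring examples; StubConverter and every hyperparameter value below are illustrative assumptions, not upstream defaults:

import torch.nn as nn

class StubConverter(nn.Module):
    """Placeholder; the real converter module is not shown in these examples."""

encoder = Encoder(n_vocab=149, embed_dim=128,
                  n_speakers=1, speaker_embed_dim=16)
decoder = Decoder(embed_dim=128, n_speakers=1, speaker_embed_dim=16,
                  in_dim=80, r=4)
model = DeepVoice3(encoder, decoder, StubConverter(),
                   mel_dim=80, linear_dim=4096, n_speakers=1)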
Example #4
    def __init__(self,
                 device,
                 sample_size,
                 padding_index,
                 ntokens=5,
                 embed_dim=512,
                 noise_std=0.1,
                 dropout=0.1):
        """
        number of latent-space tokens is constant.
        """
        super().__init__(None)
        # self.device = device
        self.dropout = dropout
        self.dim = embed_dim
        self.ntokens = ntokens

        self.content_embeddings = Embedding(
            sample_size, embed_dim * ntokens,
            padding_index)  # tokens-encoder, sample-specific

        self.sentiment_embedding = torch.nn.Embedding(
            num_embeddings=20, embedding_dim=embed_dim * ntokens)
        self.sentiment_embeddings_flags = torch.nn.Embedding(
            num_embeddings=sample_size, embedding_dim=20)
        for p in self.sentiment_embeddings_flags.parameters():
            torch.nn.init.uniform_(p, a=0.5, b=1.0)

        self.noise = Normal(loc=0.0, scale=noise_std)
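
The forward pass for this variant is not shown, so how the per-sample flags interact with the 20 sentiment embeddings is left open; one hypothetical reading (not from the source) is a flag-weighted mixture:

def mixed_sentiment(self, sample_ids):
    """Hypothetical helper: gate the 20 sentiment embeddings by each
    sample's learned flags, producing a weighted mixture per sample."""
    flags = self.sentiment_embeddings_flags(sample_ids)      # batch x 20
    table = self.sentiment_embedding.weight                  # 20 x (dim * ntokens)
    return (flags @ table).view(-1, self.ntokens, self.dim)  # batch x ntokens x dim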
Example #5
    def __init__(self,
                 device,
                 sample_size,
                 padding_index,
                 ntokens=5,
                 embed_dim=512,
                 noise_std=0.1,
                 dropout=0.1,
                 page_size=2**14):
        """
        number of latent-space tokens is constant.
        """
        super().__init__(None)
        # self.device = device
        self.number_of_partitions = math.ceil(sample_size / page_size)
        self.active_partition = -1
        self.page_size = page_size
        self.dropout = dropout
        self.dim = embed_dim
        self.ntokens = ntokens

        self.content_embeddings = nn.ModuleList([
            Embedding(page_size, embed_dim * ntokens, padding_index)
            for _ in range(self.number_of_partitions)
        ])

        self.negative_embedding = PositionalEmbedding(
            num_embeddings=ntokens + 1,
            embedding_dim=embed_dim,
            padding_idx=0)

        self.positive_embedding = PositionalEmbedding(
            num_embeddings=ntokens + 1,
            embedding_dim=embed_dim,
            padding_idx=0)

        self.noise = Normal(loc=0.0, scale=noise_std)
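
The paged table needs a lookup that routes each global sample index to its partition. A minimal sketch (assumed; the real forward pass is not shown) for a batch whose indices all live in one partition:

def paged_lookup(content_embeddings, page_size, sample_ids):
    """Map global indices to (partition, local offset) and look them up.
    Sketch only: assumes all of sample_ids fall in the same partition."""
    partition = int(sample_ids[0]) // page_size
    local = sample_ids - partition * page_size
    return content_embeddings[partition](local)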
Example #6
class NoEncoder(FairseqEncoder):
    """
    The input contain:
        sequence of latent embedding indecies
        class index (positive / negative)
        embed the input and noise the sample embeddings.
    """
    def __init__(self,
                 device,
                 sample_size,
                 padding_index,
                 ntokens=5,
                 embed_dim=512,
                 noise_std=0.1,
                 dropout=0.1):
        """
        number of latent-space tokens is constant.
        """
        super().__init__(None)
        # self.device = device
        self.dropout = dropout
        self.dim = embed_dim
        self.ntokens = ntokens

        self.content_embeddings = Embedding(
            sample_size, embed_dim * ntokens,
            padding_index)  # tokens-encoder, sample-specific
        # self.content_embeddings = LargeEmbedding(sample_size, embed_dim * ntokens, page_size=1024, num_devices=1, use_cuda=True) # tokens-encoder, sample-specific

        self.negative_embedding = PositionalEmbedding(
            num_embeddings=ntokens + 1,
            embedding_dim=embed_dim,
            padding_idx=0)

        self.positive_embedding = PositionalEmbedding(
            num_embeddings=ntokens + 1,
            embedding_dim=embed_dim,
            padding_idx=0)

        self.noise = Normal(loc=0.0, scale=noise_std)

    def get_active_parameters(self):
        return list(self.positive_embedding.parameters()) + \
               list(self.negative_embedding.parameters()) + \
               list(self.content_embeddings.parameters())

    def forward(self, src_tokens, src_lengths):
        """
        src_tokens are two: one for the sentiment (0 or 1),
                            and one for the sample [0.. sample_size]
                            shape is always (batch, 2)
        src_lengths is (batch)-size array full of 2.
        """

        batch_size = src_tokens.size()[0]

        # content embedding and noise
        content = self.content_embeddings(src_tokens[:, 0])
        content = content.view(batch_size, self.ntokens, self.dim)
        content = content + self.noise.sample(sample_shape=content.size()).to(
            content.device)

        # sentiment positional embedding
        positions = torch.arange(1, self.ntokens + 1).unsqueeze(0).to(
            content.device)  # 1 x ntokens
        sentiment = src_tokens[:, 1].unsqueeze(1).unsqueeze(2)  # batch x 1 x 1

        sentiment = self.positive_embedding(positions) * sentiment + \
                    self.negative_embedding(positions) * (1 - sentiment)  # batch x ntokens x dim

        x = content + sentiment
        x = F.dropout(x, p=self.dropout, training=self.training)

        return {'encoder_out': (x, x), 'encoder_padding_mask': None}
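
A quick smoke test for NoEncoder, assuming the Embedding helper sketched after Example #1 and a PositionalEmbedding that maps position indices to embedding vectors:

import torch

enc = NoEncoder(device='cpu', sample_size=100, padding_index=0,
                ntokens=5, embed_dim=8)
src_tokens = torch.tensor([[3, 1],   # sample 3, positive sentiment
                           [7, 0]])  # sample 7, negative sentiment
src_lengths = torch.full((2,), 2)
out = enc(src_tokens, src_lengths)
print(out['encoder_out'][0].shape)   # expected: torch.Size([2, 5, 8])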
Example #7
    def __init__(
        self,
        embed_dim,
        n_speakers,
        speaker_embed_dim,
        in_dim=80,
        r=5,
        max_positions=512,
        padding_idx=None,
        convolutions=((128, 5, 1), ) * 4,
        attention=True,
        dropout=0.1,
        use_memory_mask=False,
        force_monotonic_attention=True,
        query_position_rate=1.0,
        key_position_rate=1.29,
    ):
        super(Decoder, self).__init__()
        self.dropout = dropout
        self.in_dim = in_dim
        self.r = r

        in_channels = in_dim * r
        if isinstance(attention, bool):
            # expand True into [True, True, ...] and do the same with False
            attention = [attention] * len(convolutions)

        # Position encodings for query (decoder states) and keys (encoder states)
        self.embed_query_positions = Embedding(max_positions,
                                               convolutions[0][0], padding_idx)
        self.embed_query_positions.weight.data = position_encoding_init(
            max_positions,
            convolutions[0][0],
            position_rate=query_position_rate)
        self.embed_keys_positions = Embedding(max_positions, embed_dim,
                                              padding_idx)
        self.embed_keys_positions.weight.data = position_encoding_init(
            max_positions, embed_dim, position_rate=key_position_rate)

        self.fc1 = Linear(in_channels, convolutions[0][0], dropout=dropout)
        in_channels = convolutions[0][0]

        # Causal convolutions
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        self.attention = nn.ModuleList()

        Conv1dLayer = Conv1d if has_dilation(
            convolutions) else LinearizedConv1d

        for i, (out_channels, kernel_size,
                dilation) in enumerate(convolutions):
            pad = (kernel_size - 1) * dilation
            dilation = (dilation, )
            self.projections.append(
                Linear(in_channels, out_channels)
                if in_channels != out_channels else None)
            self.convolutions.append(
                Conv1dLayer(in_channels,
                            out_channels * 2,
                            kernel_size,
                            padding=pad,
                            dilation=dilation,
                            dropout=dropout))
            self.attention.append(
                AttentionLayer(out_channels, embed_dim, dropout=dropout)
                if attention[i] else None)
            in_channels = out_channels
        self.fc2 = Linear(in_channels, in_dim * r)

        # decoder states -> Done binary flag
        self.fc3 = Linear(in_channels, 1)

        self._is_inference_incremental = False
        self.max_decoder_steps = 200
        self.min_decoder_steps = 10
        self.use_memory_mask = use_memory_mask
        if isinstance(force_monotonic_attention, bool):
            self.force_monotonic_attention = \
                [force_monotonic_attention] * len(convolutions)
        else:
            self.force_monotonic_attention = force_monotonic_attention
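
Note the two padding schemes: the encoder pads (kernel_size - 1) // 2 * dilation for same-length, non-causal output, while this decoder pads the full (kernel_size - 1) * dilation so that each step sees only current and past frames once the future-looking tail is trimmed (presumably inside the custom Conv1d). A demonstration of the causal trick with plain nn.Conv1d:

import torch
import torch.nn as nn

k, d = 5, 2
conv = nn.Conv1d(1, 1, kernel_size=k, padding=(k - 1) * d, dilation=d)
x = torch.randn(1, 1, 10)
y = conv(x)[..., : x.size(-1)]   # trim the tail -> causal, same length
assert y.shape == x.shape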