def __init__(self, n_vocab, embed_dim, n_speakers, speaker_embed_dim,
             padding_idx=None, convolutions=((64, 5, 1),) * 7,
             max_positions=512, dropout=0.1):
    super(Encoder, self).__init__()
    self.dropout = dropout
    self.num_attention_layers = None

    # Text input embeddings
    self.embed_tokens = Embedding(n_vocab, embed_dim, padding_idx)

    # Text position embedding
    self.embed_text_positions = Embedding(max_positions, embed_dim, padding_idx)
    self.embed_text_positions.weight.data = position_encoding_init(
        max_positions, embed_dim)

    # Speaker embedding
    if n_speakers > 1:
        self.speaker_fc1 = Linear(speaker_embed_dim, embed_dim)
        self.speaker_fc2 = Linear(speaker_embed_dim, embed_dim)
    self.n_speakers = n_speakers

    # Non-causal convolutions
    in_channels = convolutions[0][0]
    self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
    self.projections = nn.ModuleList()
    self.speaker_projections = nn.ModuleList()
    self.convolutions = nn.ModuleList()
    Conv1dLayer = Conv1d if has_dilation(convolutions) else ConvTBC
    for (out_channels, kernel_size, dilation) in convolutions:
        pad = (kernel_size - 1) // 2 * dilation
        dilation = (dilation,)
        self.projections.append(
            Linear(in_channels, out_channels)
            if in_channels != out_channels else None)
        self.speaker_projections.append(
            Linear(speaker_embed_dim, out_channels)
            if n_speakers > 1 else None)
        self.convolutions.append(
            Conv1dLayer(in_channels, out_channels * 2, kernel_size,
                        padding=pad, dilation=dilation, dropout=dropout))
        in_channels = out_channels
    self.fc2 = Linear(in_channels, embed_dim)
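# Both the encoder and decoder assign position_encoding_init(...) into
# embedding weights and branch on has_dilation(...), but neither helper is
# shown above. A minimal sketch consistent with how they are called,
# assuming the standard sinusoidal table scaled by position_rate (an
# assumption about the helper, not taken from this codebase):
import numpy as np
import torch


def has_dilation(convolutions):
    # True if any conv spec (out_channels, kernel_size, dilation) dilates
    return any(dilation > 1 for _, _, dilation in convolutions)


def position_encoding_init(n_position, d_pos_vec, position_rate=1.0):
    """Sinusoid position encoding table; row 0 stays zero for padding."""
    position_enc = np.array([
        [position_rate * pos / np.power(10000, 2 * (i // 2) / d_pos_vec)
         for i in range(d_pos_vec)]
        if pos != 0 else np.zeros(d_pos_vec)
        for pos in range(n_position)])
    position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])  # dim 2i
    position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])  # dim 2i + 1
    return torch.from_numpy(position_enc).float()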
def __init__(self, encoder, decoder, converter, mel_dim=80, linear_dim=4096,
             n_speakers=1, speaker_embed_dim=16, padding_idx=None,
             trainable_positional_encodings=False):
    super(DeepVoice3, self).__init__()
    self.mel_dim = mel_dim
    self.linear_dim = linear_dim
    self.trainable_positional_encodings = trainable_positional_encodings

    self.encoder = encoder
    self.decoder = decoder
    self.converter = converter
    self.encoder.num_attention_layers = sum(
        layer is not None for layer in decoder.attention)

    # Speaker embedding
    if n_speakers > 1:
        self.embed_speakers = Embedding(n_speakers, speaker_embed_dim,
                                        padding_idx)
    self.n_speakers = n_speakers
    self.speaker_embed_dim = speaker_embed_dim

    self.use_text_pos_embedding_in_encoder = False
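# Usage sketch wiring the model above. Converter is hypothetical here (a
# module mapping decoder states to linear_dim spectrogram bins, with an
# assumed signature); the remaining arguments mirror the defaults above.
encoder = Encoder(n_vocab=256, embed_dim=256, n_speakers=1,
                  speaker_embed_dim=16)
decoder = Decoder(embed_dim=256, n_speakers=1, speaker_embed_dim=16,
                  in_dim=80, r=5)
converter = Converter(in_dim=80, out_dim=4096)  # hypothetical signature
model = DeepVoice3(encoder, decoder, converter,
                   mel_dim=80, linear_dim=4096, n_speakers=1)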
def __init__(self, device, sample_size, padding_index, ntokens=5,
             embed_dim=512, noise_std=0.1, dropout=0.1):
    """The number of latent-space tokens is constant."""
    super().__init__(None)
    # self.device = device
    self.dropout = dropout
    self.dim = embed_dim
    self.ntokens = ntokens
    # Tokens encoder, sample-specific
    self.content_embeddings = Embedding(
        sample_size, embed_dim * ntokens, padding_index)
    # 20 shared sentiment embeddings plus per-sample flags over them
    self.sentiment_embedding = torch.nn.Embedding(
        num_embeddings=20, embedding_dim=embed_dim * ntokens)
    self.sentiment_embeddings_flags = torch.nn.Embedding(
        num_embeddings=sample_size, embedding_dim=20)
    for p in self.sentiment_embeddings_flags.parameters():
        torch.nn.init.uniform_(p, a=0.5, b=1.0)
    self.noise = Normal(loc=0.0, scale=noise_std)
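# Hypothetical sketch of how the per-sample flags could mix the 20 shared
# sentiment embeddings (the forward pass is not shown in this snippet, so
# this is only one plausible reading of the layer shapes): look up a
# (batch, 20) weight row per sample, then take a weighted sum of the 20
# sentiment vectors.
import torch

ntokens, embed_dim, sample_size = 5, 512, 1000
sentiment_embedding = torch.nn.Embedding(20, embed_dim * ntokens)
flags = torch.nn.Embedding(sample_size, 20)

sample_ids = torch.tensor([3, 7])
weights = flags(sample_ids)                         # (batch, 20)
sentiment = weights @ sentiment_embedding.weight    # (batch, embed_dim * ntokens)
sentiment = sentiment.view(-1, ntokens, embed_dim)  # (batch, ntokens, embed_dim)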
def __init__(self, device, sample_size, padding_index, ntokens=5,
             embed_dim=512, noise_std=0.1, dropout=0.1, page_size=2**14):
    """The number of latent-space tokens is constant."""
    super().__init__(None)
    # self.device = device
    self.number_of_partitions = math.ceil(sample_size / page_size)
    self.active_partition = -1
    self.page_size = page_size
    self.dropout = dropout
    self.dim = embed_dim
    self.ntokens = ntokens
    # Sample-specific token embeddings, paged into fixed-size partitions
    self.content_embeddings = nn.ModuleList([
        Embedding(page_size, embed_dim * ntokens, padding_index)
        for _ in range(self.number_of_partitions)
    ])
    self.negative_embedding = PositionalEmbedding(
        num_embeddings=ntokens + 1, embedding_dim=embed_dim, padding_idx=0)
    self.positive_embedding = PositionalEmbedding(
        num_embeddings=ntokens + 1, embedding_dim=embed_dim, padding_idx=0)
    self.noise = Normal(loc=0.0, scale=noise_std)
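# Sketch of a paged lookup, under the assumption (suggested by the
# active_partition field) that each batch draws from a single page: map a
# global sample index to (partition, local index) and query that table.
# paged_lookup is a hypothetical helper, not part of the original snippet.
def paged_lookup(self, sample_ids):
    part = int(sample_ids[0]) // self.page_size  # assumes one page per batch
    local = sample_ids - part * self.page_size   # row within that page
    self.active_partition = part
    return self.content_embeddings[part](local)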
class NoEncoder(FairseqEncoder):
    """
    The input contains a sequence of latent-embedding indices and a
    class index (positive / negative); this encoder embeds the input
    and adds noise to the sample embeddings.
    """

    def __init__(self, device, sample_size, padding_index, ntokens=5,
                 embed_dim=512, noise_std=0.1, dropout=0.1):
        """The number of latent-space tokens is constant."""
        super().__init__(None)
        # self.device = device
        self.dropout = dropout
        self.dim = embed_dim
        self.ntokens = ntokens
        # Tokens encoder, sample-specific
        self.content_embeddings = Embedding(
            sample_size, embed_dim * ntokens, padding_index)
        # self.content_embeddings = LargeEmbedding(sample_size, embed_dim * ntokens, page_size=1024, num_devices=1, use_cuda=True)
        self.negative_embedding = PositionalEmbedding(
            num_embeddings=ntokens + 1, embedding_dim=embed_dim,
            padding_idx=0)
        self.positive_embedding = PositionalEmbedding(
            num_embeddings=ntokens + 1, embedding_dim=embed_dim,
            padding_idx=0)
        self.noise = Normal(loc=0.0, scale=noise_std)

    def get_active_parameters(self):
        return (list(self.positive_embedding.parameters()) +
                list(self.negative_embedding.parameters()) +
                list(self.content_embeddings.parameters()))

    def forward(self, src_tokens, src_lengths):
        """
        src_tokens holds two columns per sample: column 0 is the sample
        index in [0, sample_size) and column 1 is the sentiment (0 or 1);
        its shape is always (batch, 2).
        src_lengths is a (batch,)-sized array filled with 2.
        """
        batch_size = src_tokens.size(0)

        # Content embedding plus Gaussian noise
        content = self.content_embeddings(src_tokens[:, 0])
        content = content.view(batch_size, self.ntokens, self.dim)
        content = content + self.noise.sample(
            sample_shape=content.size()).to(content.device)

        # Sentiment positional embedding: select the positive or negative
        # table per sample via a 0/1 mask
        positions = torch.arange(1, self.ntokens + 1).unsqueeze(0).to(
            content.device)  # 1 x ntokens
        sentiment = src_tokens[:, 1].unsqueeze(1).unsqueeze(2)  # batch x 1 x 1
        sentiment = (self.positive_embedding(positions) * sentiment +
                     self.negative_embedding(positions) * (1 - sentiment))
        # batch x ntokens x dim
        x = content + sentiment
        x = F.dropout(x, p=self.dropout, training=self.training)

        return {'encoder_out': (x, x), 'encoder_padding_mask': None}
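# Toy usage sketch for NoEncoder, assuming the fairseq-style Embedding and
# PositionalEmbedding helpers used above are importable:
enc = NoEncoder(device='cpu', sample_size=1000, padding_index=0)
src_tokens = torch.tensor([[42, 1],   # sample 42, positive sentiment
                           [17, 0]])  # sample 17, negative sentiment
src_lengths = torch.tensor([2, 2])
out = enc(src_tokens, src_lengths)
x, _ = out['encoder_out']
print(x.shape)  # torch.Size([2, 5, 512]) -> (batch, ntokens, embed_dim)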
def __init__(self, embed_dim, n_speakers, speaker_embed_dim, in_dim=80, r=5,
             max_positions=512, padding_idx=None,
             convolutions=((128, 5, 1),) * 4, attention=True, dropout=0.1,
             use_memory_mask=False, force_monotonic_attention=True,
             query_position_rate=1.0, key_position_rate=1.29):
    super(Decoder, self).__init__()
    self.dropout = dropout
    self.in_dim = in_dim
    self.r = r

    in_channels = in_dim * r
    if isinstance(attention, bool):
        # Expand True into [True, True, ...] and do the same with False
        attention = [attention] * len(convolutions)

    # Position encodings for query (decoder states) and keys (encoder states)
    self.embed_query_positions = Embedding(
        max_positions, convolutions[0][0], padding_idx)
    self.embed_query_positions.weight.data = position_encoding_init(
        max_positions, convolutions[0][0],
        position_rate=query_position_rate)
    self.embed_keys_positions = Embedding(max_positions, embed_dim,
                                          padding_idx)
    self.embed_keys_positions.weight.data = position_encoding_init(
        max_positions, embed_dim, position_rate=key_position_rate)

    self.fc1 = Linear(in_channels, convolutions[0][0], dropout=dropout)
    in_channels = convolutions[0][0]

    # Causal convolutions
    self.projections = nn.ModuleList()
    self.convolutions = nn.ModuleList()
    self.attention = nn.ModuleList()
    Conv1dLayer = Conv1d if has_dilation(convolutions) else LinearizedConv1d
    for i, (out_channels, kernel_size, dilation) in enumerate(convolutions):
        pad = (kernel_size - 1) * dilation
        dilation = (dilation,)
        self.projections.append(
            Linear(in_channels, out_channels)
            if in_channels != out_channels else None)
        self.convolutions.append(
            Conv1dLayer(in_channels, out_channels * 2, kernel_size,
                        padding=pad, dilation=dilation, dropout=dropout))
        self.attention.append(
            AttentionLayer(out_channels, embed_dim, dropout=dropout)
            if attention[i] else None)
        in_channels = out_channels
    self.fc2 = Linear(in_channels, in_dim * r)

    # Decoder states -> Done binary flag
    self.fc3 = Linear(in_channels, 1)

    self._is_inference_incremental = False
    self.max_decoder_steps = 200
    self.min_decoder_steps = 10
    self.use_memory_mask = use_memory_mask
    if isinstance(force_monotonic_attention, bool):
        self.force_monotonic_attention = \
            [force_monotonic_attention] * len(convolutions)
    else:
        self.force_monotonic_attention = force_monotonic_attention
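# Why pad = (kernel_size - 1) * dilation keeps the decoder convolutions
# causal: with that padding, output frame t covers inputs t - pad .. t, and
# the surplus tail is trimmed. A standalone demo (plain nn.Conv1d stands in
# for the Conv1dLayer used above):
import torch
import torch.nn as nn

T, k, d = 10, 5, 2
conv = nn.Conv1d(1, 1, k, padding=(k - 1) * d, dilation=d)
x = torch.randn(1, 1, T)
y = conv(x)[:, :, :T]  # trim to T frames; frame t never sees inputs after t
print(y.shape)         # torch.Size([1, 1, 10])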