Example #1
    def __init__(self,
                 vocab_size: int,
                 channel: int,
                 res_channel: int,
                 n_res_blocks: int,
                 n_encoders: int,
                 tau: float,
                 pad_idx: Union[None, int],
                 input_noise=0.0,
                 embed_dropout=0.0,
                 num_vq_embeds: int = 512,
                 vq_embeds_dim: int = None,
                 vq_loss_alpha=0.25,
                 vq_decay=0.99,
                 ignore_quant=False):
        super().__init__()

        self.vocab_size = vocab_size
        self.pad_idx = pad_idx
        self.tau = tau
        self.vq_loss_alpha = vq_loss_alpha
        self.ignore_quant = ignore_quant

        self.vq_loss = 0
        self.nll_loss = 0
        self.acc = 0

        self.vq_blend = 0.0
        self.blend_steps = 5000
        self.blend_step = 0

        self.vq_embeds_dim = vq_embeds_dim

        self.input_noise = CategoricalNoise(vocab_size, input_noise)
        self.embed_dropout = nn.Dropout(embed_dropout)

        self.embed = nn.Embedding(vocab_size,
                                  channel,
                                  padding_idx=pad_idx,
                                  max_norm=1.0)

        self.encoder = nn.Sequential(*[
            Encoder(channel, channel, res_channel, n_res_blocks)
            for _ in range(n_encoders)
        ])
        self.decoder = nn.Sequential(*[
            Decoder(channel, channel, res_channel, n_res_blocks)
            for _ in range(n_encoders)
        ])

        self.conv_to_quant = nn.Conv1d(channel, vq_embeds_dim, kernel_size=1)
        self.quant_to_conv = nn.Conv1d(vq_embeds_dim, channel, kernel_size=1)

        self.quantize = Quantize(dim=vq_embeds_dim,
                                 n_embed=num_vq_embeds,
                                 decay=vq_decay)

        self.conv_to_logits = nn.Conv1d(channel, vocab_size, kernel_size=1)

        self.nll = nn.NLLLoss(reduction='none', ignore_index=self.pad_idx)
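
The modules above trace an embed -> encoder -> conv_to_quant -> quantize -> quant_to_conv -> decoder -> conv_to_logits pipeline, but the Quantize module itself is not shown in these snippets. As a reference point only, here is a minimal, self-contained sketch of the straight-through vector-quantization step such pipelines typically rely on; every name below is illustrative, not the source's API:

import torch

def straight_through_quantize(z_e: torch.Tensor, codebook: torch.Tensor):
    # z_e: (batch, dim, length) encoder output; codebook: (n_embed, dim)
    b, d, t = z_e.shape
    flat = z_e.permute(0, 2, 1).reshape(-1, d)          # (batch * length, dim)
    idx = torch.cdist(flat, codebook).argmin(dim=1)     # nearest code per position
    z_q = codebook[idx].view(b, t, d).permute(0, 2, 1)  # back to (batch, dim, length)
    # Straight-through estimator: gradients reach the encoder as if quantization were identity.
    z_q_st = z_e + (z_q - z_e).detach()
    commit_loss = (z_q.detach() - z_e).pow(2).mean()
    return z_q_st, commit_loss, idx.view(b, t)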
Example #2
    def __init__(self,
                 vocab_size: int,
                 channel: int,
                 res_channel: int,
                 n_res_block: int,
                 tau: float,
                 pad_idx: Union[None, int],
                 input_embed_dim: int,
                 input_noise=0.0,
                 embed_dropout=0.1,
                 num_vq_embeds: int = 512,
                 vq_embeds_dim: int = None,
                 vq_loss_alpha=1.0,
                 ignore_quant=False,
                 **kwargs):
        super().__init__()

        self.pad_idx = pad_idx
        self.tau = tau
        self.vq_loss_alpha = vq_loss_alpha

        self.vq_loss = 0
        self.nll_loss = 0
        self.acc = 0

        self.vq_blend = 0.0
        self.blend_steps = 10000
        self.blend_step = 0

        self.vq_embeds_dim = vq_embeds_dim
        self.ignore_quant = ignore_quant

        self.input_noise = CategoricalNoise(vocab_size, input_noise)
        self.embed_dropout = nn.Dropout(embed_dropout)

        self.embed = nn.Embedding(vocab_size,
                                  input_embed_dim,
                                  padding_idx=pad_idx)
        self.quantize = Quantize(vq_embeds_dim, num_vq_embeds, decay=0.90)

        self.embeds_to_encode = nn.Conv1d(input_embed_dim,
                                          channel,
                                          kernel_size=3,
                                          padding=1)
        self.encode_to_quants = nn.Conv1d(channel,
                                          vq_embeds_dim,
                                          kernel_size=3,
                                          padding=1)
        self.quants_to_decode = nn.Conv1d(vq_embeds_dim,
                                          channel,
                                          kernel_size=3,
                                          padding=1)

        self.encoder = Encoder(channel, res_channel, n_res_block)
        self.decoder = Decoder(channel, vocab_size, res_channel, n_res_block)

        self.nll = nn.NLLLoss(reduction='none', ignore_index=self.pad_idx)
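
Every variant in this file carries vq_blend, blend_steps, and blend_step, which suggests the quantized code is ramped into the continuous encoder output over the first blend_steps updates. A hedged sketch of such a linear schedule (the actual update rule lives outside these constructors and may differ):

def blend_schedule(blend_step: int, blend_steps: int) -> float:
    # 0.0 -> purely continuous code, 1.0 -> fully quantized code
    return min(1.0, blend_step / max(1, blend_steps))

def blended_code(z_e, z_q, blend: float):
    return (1.0 - blend) * z_e + blend * z_q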
Example #3
    def __init__(self,
                 vocab_size: int,
                 channel: int,
                 n_fold: int,
                 tau: float,
                 pad_idx: Union[None, int],
                 input_noise=0.0,
                 embed_dropout=0.1,
                 num_vq_embeds: int = 512,
                 vq_embeds_dim: int = None,
                 vq_loss_alpha=1.0,
                 d_slice=1,
                 ignore_quant=False):
        super().__init__()

        self.pad_idx = pad_idx
        self.tau = tau
        self.vq_loss_alpha = vq_loss_alpha
        self.d_slice = d_slice
        self.ignore_quant = ignore_quant

        self.vq_loss = 0
        self.nll_loss = 0
        self.acc = 0

        self.vq_blend = 0.0
        self.blend_steps = 10000
        self.blend_step = 0

        self.vq_embeds_dim = vq_embeds_dim

        self.input_noise = CategoricalNoise(vocab_size, input_noise)
        self.embed_dropout = nn.Dropout(embed_dropout)

        self.embed = nn.Embedding(vocab_size, channel, padding_idx=pad_idx)
        self.quantize = SlicedQuantize(d_slice,
                                       dim=vq_embeds_dim,
                                       n_embed=num_vq_embeds,
                                       decay=0.99)

        #self.embeds_to_encode = nn.Conv1d(input_embed_dim, channel, kernel_size=3, padding=1)
        self.encode_to_quants = nn.Conv1d(channel,
                                          vq_embeds_dim,
                                          kernel_size=1)
        #self.quants_to_decode = nn.Conv1d(vq_embeds_dim, channel, kernel_size=1)

        #self.encoder = Encoder(channel, 64, 2)
        self.encoder = nn.Sequential(
            *[EncoderBlock(channel) for i in range(n_fold)])
        self.decoder = Decoder(channel,
                               vocab_size,
                               vq_embeds_dim,
                               n_blocks=int(np.log2(256) - np.log2(16) + 1))

        self.nll = nn.NLLLoss(reduction='none', ignore_index=self.pad_idx)
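
Note that the decoder depth above is fixed arithmetically: int(np.log2(256) - np.log2(16) + 1) = 8 - 4 + 1 = 5 blocks, i.e. the four doublings needed to bring a length-16 code sequence back to length 256, plus one extra block (assuming each Decoder block upsamples by a factor of two; the Decoder implementation is not shown here).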
Example #4
    def __init__(self,
                 vocab_size: int,
                 channel: int,
                 n_fold: int,
                 tau: float,
                 pad_idx: Union[None, int],
                 input_noise=0.0,
                 embed_dropout=0.1,
                 num_vq_embeds: int = 512,
                 vq_embeds_dim: int = None,
                 vq_loss_alpha=1.0,
                 d_slice: int = 1,
                 **kwargs):
        super().__init__()

        self.pad_idx = pad_idx
        self.tau = tau
        self.vq_loss_alpha = vq_loss_alpha
        self.d_slice = d_slice

        self.vq_loss = 0
        self.nll_loss = 0
        self.acc = 0

        self.vq_embeds_dim = vq_embeds_dim

        self.input_noise = CategoricalNoise(vocab_size, input_noise)
        self.embed_dropout = nn.Dropout(embed_dropout)

        self.embed = nn.Embedding(vocab_size, channel, padding_idx=pad_idx)
        self.quantize = Quantize(vq_embeds_dim // self.d_slice,
                                 num_vq_embeds,
                                 decay=0.99)

        #self.embeds_to_encode = nn.Conv1d(input_embed_dim, channel, kernel_size=3, padding=1)
        self.encode_to_quants = nn.Conv1d(channel,
                                          vq_embeds_dim,
                                          kernel_size=3,
                                          padding=1)
        self.quants_to_decode = nn.Conv1d(vq_embeds_dim,
                                          channel,
                                          kernel_size=3,
                                          padding=1)

        self.encoder = Encoder(channel, n_fold)
        self.decoder = Decoder(channel, n_fold)

        self.nll = nn.NLLLoss(reduction='none', ignore_index=self.pad_idx)
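
In this variant Quantize receives dim = vq_embeds_dim // d_slice, which suggests the vq_embeds_dim channels are split into d_slice groups that share a single codebook. A hedged, self-contained sketch of one way such a split can be arranged; the layout and the helper name are assumptions, not the source's SlicedQuantize:

import torch

def slice_for_quantization(z_e: torch.Tensor, d_slice: int) -> torch.Tensor:
    # (batch, vq_embeds_dim, length) -> (batch, vq_embeds_dim // d_slice, length * d_slice)
    b, c, t = z_e.shape
    assert c % d_slice == 0
    return (z_e.view(b, d_slice, c // d_slice, t)
            .permute(0, 2, 3, 1)
            .reshape(b, c // d_slice, t * d_slice))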
Example #5
    def __init__(self,
                 vocab_size: int,
                 dim: int,
                 dim_feedforward: int,
                 n_fold: int,
                 tau: float,
                 pad_idx: Union[None, int],
                 input_noise=0.0,
                 embed_dropout=0.1,
                 num_vq_embeds: int = 512,
                 vq_embeds_dim: int = None,
                 vq_loss_alpha=1.0,
                 ignore_quant=False):
        super().__init__()

        self.pad_idx = pad_idx
        self.tau = tau
        self.vq_loss_alpha = vq_loss_alpha
        self.ignore_quant = ignore_quant

        self.vq_loss = 0
        self.nll_loss = 0
        self.acc = 0

        self.vq_blend = 0.0
        self.blend_steps = 10000
        self.blend_step = 0

        self.vq_embeds_dim = vq_embeds_dim

        self.input_noise = CategoricalNoise(vocab_size, input_noise)
        self.embed_dropout = nn.Dropout(embed_dropout)

        self.embed = nn.Embedding(vocab_size, dim, padding_idx=pad_idx)
        self.quantize = Quantize(vq_embeds_dim, num_vq_embeds, decay=0.95)

        self.encode_to_quants = nn.Linear(dim, vq_embeds_dim)
        self.quants_to_decode = nn.Linear(vq_embeds_dim, dim)

        self.encoder = nn.Sequential(*[
            BaseBlock(dim, dim_feedforward, mode='shrink')
            for _ in range(n_fold)
        ])
        self.decoder = nn.Sequential(*[
            BaseBlock(dim, dim_feedforward, mode='expand')
            for _ in range(n_fold)
        ])

        self.nll = nn.NLLLoss(reduction='none', ignore_index=self.pad_idx)
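
Unlike the Conv1d-based variants above, this model projects into and out of the codebook with nn.Linear, which acts on the last dimension, so tensors here are presumably carried as (batch, length, dim) rather than (batch, channel, length). For reference, the two layouts side by side:

import torch
from torch import nn

x_bld = torch.randn(2, 100, 32)   # (batch, length, dim) layout expected by nn.Linear
x_bcl = torch.randn(2, 32, 100)   # (batch, channel, length) layout expected by nn.Conv1d
assert nn.Linear(32, 8)(x_bld).shape == (2, 100, 8)
assert nn.Conv1d(32, 8, kernel_size=1)(x_bcl).shape == (2, 8, 100)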
Example #6
    def __init__(self,
                 vocab_size: int,
                 embed_dim: int,
                 channel: int,
                 res_channel: int,
                 n_res_block: int,
                 tau: float,
                 pad_idx: Union[None, int],
                 input_noise=0.0,
                 embed_dropout=0.0,
                 num_vq_embeds: int = 512,
                 vq_embed_dim: int = 64,
                 vq_loss_alpha=1.0,
                 d_slice=1,
                 ignore_quant=False):
        super().__init__()

        self.vocab_size = vocab_size
        self.pad_idx = pad_idx
        self.tau = tau
        self.vq_loss_alpha = vq_loss_alpha
        self.d_slice = d_slice
        self.ignore_quant = ignore_quant

        self.vq_loss = 0
        self.nll_loss = 0
        self.acc = 0

        self.vq_blend = 0.0
        self.blend_steps = 5000
        self.blend_step = 0

        self.vq_embed_dim = vq_embed_dim

        self.input_noise = CategoricalNoise(vocab_size, input_noise)
        self.embed_dropout = nn.Dropout(embed_dropout)
        self.embed = nn.Embedding(vocab_size,
                                  embed_dim,
                                  padding_idx=pad_idx,
                                  max_norm=1.0)
        #self.pos_encoder = PositionalEncoding(embed_dim, max_len=1024)
        #self.quant_pos_encoder = PositionalEncoding(vq_embed_dim, max_len=256)

        self.enc_a = Encoder(embed_dim, channel, res_channel, n_res_block)
        self.enc_b = Encoder(channel, channel, res_channel, n_res_block)

        self.quantize_conv_b = nn.Conv1d(channel, vq_embed_dim, 1)
        #self.quantize_b = SlicedQuantize(d_slice, dim=vq_embed_dim, n_embed=num_vq_embeds)
        self.quantize_b = Quantize(vq_embed_dim, num_vq_embeds)
        #self.quantize_b = DecomposedQuantize(16, vq_embed_dim, num_vq_embeds)
        self.dec_b = Decoder(vq_embed_dim, vq_embed_dim, channel, res_channel, n_res_block)

        self.quantize_conv_a = nn.Conv1d(vq_embed_dim + channel, vq_embed_dim, 1)
        #self.quantize_a = SlicedQuantize(d_slice, dim=vq_embed_dim, n_embed=num_vq_embeds)
        self.quantize_a = Quantize(vq_embed_dim, num_vq_embeds)
        #self.quantize_a = DecomposedQuantize(64, vq_embed_dim, num_vq_embeds)
        self.upsample_b = nn.Sequential(
            nn.ConvTranspose1d(vq_embed_dim, channel, 4, stride=2, padding=1),
            nn.ELU(),
            nn.ConvTranspose1d(channel, vq_embed_dim, 4, stride=2, padding=1),
        )
        self.dec = Decoder(
            vq_embed_dim + vq_embed_dim,
            vocab_size,
            channel,
            res_channel,
            n_res_block,
        )

        self.noise = Noise(alpha=0.05)

        self.nll = nn.NLLLoss(reduction='none', ignore_index=self.pad_idx)
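
The channel arithmetic above implies a two-level, VQ-VAE-2-style hierarchy: quantize_conv_a expects vq_embed_dim + channel input channels, i.e. the decoded top code (dec_b outputs vq_embed_dim channels) concatenated with the bottom encoder features (enc_a outputs channel channels), and the final dec expects vq_embed_dim + vq_embed_dim channels, i.e. the upsampled top code concatenated with the bottom code. The forward pass is not shown, so this reading is inferred from the constructor alone.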
Example #7
    def __init__(self,
                 vocab_size: int,
                 embed_dim: int,
                 channel: int,
                 res_channel: int,
                 n_res_block: int,
                 tau: float,
                 pad_idx: Union[None, int],
                 eos_idx: int,
                 input_noise=0.0,
                 embed_dropout=0.0,
                 num_vq_embeds: int = 512,
                 vq_embed_dim: int = 64,
                 vq_loss_alpha=1.0,
                 d_slice=1,
                 ignore_quant=False):
        super().__init__()

        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.pad_idx = pad_idx
        self.eos_idx = eos_idx
        self.tau = tau
        self.vq_loss_alpha = vq_loss_alpha
        self.d_slice = d_slice
        self.ignore_quant = ignore_quant

        self.vq_loss = 0
        self.nll_loss = 0
        self.acc = 0

        self.vq_blend = 0.0
        self.blend_steps = 5000
        self.blend_step = 0

        self.vq_embed_dim = vq_embed_dim

        self.input_noise = CategoricalNoise(vocab_size, input_noise)
        self.embed_dropout = nn.Dropout(embed_dropout)
        self.embed = nn.Embedding(vocab_size,
                                  embed_dim,
                                  padding_idx=pad_idx,
                                  max_norm=1.0)
        self.pos_encoder = PositionalEncoding(embed_dim)

        self.enc1 = Encoder(embed_dim,
                            channel,
                            res_channel,
                            n_res_block,
                            down=2)
        self.enc2 = Encoder(channel, channel, res_channel, n_res_block, down=2)
        self.enc3 = Encoder(channel, channel, res_channel, n_res_block, down=1)

        self.quant_conv_1 = nn.Conv1d(channel, vq_embed_dim, kernel_size=1)
        self.quant_conv_2 = nn.Conv1d(channel, vq_embed_dim, kernel_size=1)
        self.quant_conv_3 = nn.Conv1d(channel, vq_embed_dim, kernel_size=1)

        self.quant_1 = Quantize(vq_embed_dim, num_vq_embeds)
        self.quant_2 = Quantize(vq_embed_dim, num_vq_embeds)
        self.quant_3 = Quantize(vq_embed_dim, num_vq_embeds)

        self.conv_quant_1 = nn.Conv1d(vq_embed_dim, 256, kernel_size=1)
        self.conv_quant_2 = nn.Conv1d(vq_embed_dim, 256, kernel_size=1)
        self.conv_quant_3 = nn.Conv1d(vq_embed_dim, 256, kernel_size=1)

        self.decoder = TransformerDecoder(vocab_size + 1, 256, 4, 1024, 6)

        #self.noise = Noise(alpha=0.05)

        self.nll = nn.NLLLoss(reduction='none', ignore_index=self.pad_idx)
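
Here the three quantized scales are each projected to 256 channels (conv_quant_1/2/3) and consumed by a TransformerDecoder instead of a convolutional decoder; the vocab_size + 1 passed to it presumably reserves one extra symbol (e.g. a start-of-sequence token) for autoregressive decoding. This is an inference from the constructor; the decoding logic itself is not shown.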
Example #8
    def __init__(self,
                 vocab_size: int,
                 embed_dim: int,
                 channel: int,
                 res_channel: int,
                 n_res_block: int,
                 tau: float,
                 pad_idx: Union[None, int],
                 config: List[Tuple[int, int]],
                 input_noise=0.0,
                 embed_dropout=0.0,
                 vq_embed_dim: int = 8,
                 vq_loss_alpha=1.0,
                 attn=False,
                 d_slice=1,
                 ignore_quant=False):
        super().__init__()

        self.vocab_size = vocab_size
        self.pad_idx = pad_idx
        self.tau = tau
        self.vq_loss_alpha = vq_loss_alpha
        self.d_slice = d_slice
        self.ignore_quant = ignore_quant

        self.vq_loss = 0
        self.nll_loss = 0
        self.acc = 0

        self.vq_blend = 0.0
        self.blend_steps = 5000
        self.blend_step = 0

        self.vq_embed_dim = vq_embed_dim

        self.input_noise = CategoricalNoise(vocab_size, input_noise)
        self.embed_dropout = nn.Dropout(embed_dropout)
        self.embed = nn.Embedding(vocab_size,
                                  embed_dim,
                                  padding_idx=pad_idx,
                                  max_norm=1.0)

        self.encoders = nn.ModuleList()
        self.decoders = nn.ModuleList()
        self.cond_decoders = nn.ModuleList()
        self.quant_conv = nn.ModuleList()
        self.quantize = nn.ModuleList()

        for i, (factor, n_embeds) in enumerate(config):
            self.encoders.append(
                Encoder(embed_dim if i == 0 else channel,
                        channel,
                        res_channel,
                        down=factor))

            dec_in_channels = vq_embed_dim * (1 if i + 1 == len(config) else 2)
            dec_out_channels = vocab_size if i == 0 else vq_embed_dim

            self.decoders.append(
                Decoder(dec_in_channels,
                        dec_out_channels,
                        channel,
                        res_channel,
                        up=factor))

            if i + 1 != len(config):
                self.cond_decoders.append(
                    Decoder(vq_embed_dim,
                            vq_embed_dim,
                            channel,
                            res_channel,
                            up=config[i + 1][0],
                            n_res_blocks=1))
            else:
                self.cond_decoders.append(None)

            self.quant_conv.append(
                nn.Conv1d(channel +
                          (0 if i + 1 == len(config) else vq_embed_dim),
                          vq_embed_dim,
                          kernel_size=1))
            #self.quant_conv.append(nn.Conv1d(channel, vq_embed_dim, kernel_size=1))
            self.quantize.append(Quantize(vq_embed_dim, n_embeds))

        self.upsample_t = nn.Sequential(
            nn.ConvTranspose1d(vq_embed_dim, channel, 4, stride=2, padding=1),
            nn.ELU(),
            nn.ConvTranspose1d(channel, vq_embed_dim, 4, stride=2, padding=1),
        )

        self.noise = Noise(alpha=0.05)

        self.nll = nn.NLLLoss(reduction='none', ignore_index=self.pad_idx)
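
Judging by the loop above, config pairs a down/upsampling factor with a codebook size for each hierarchy level, ordered from the bottom level (the one that decodes back to the vocabulary) upward. A purely hypothetical value, for illustration only:

# (factor, n_embeds) per level, bottom level first; example values, not taken from the source
config = [(2, 512), (2, 512), (4, 256)]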