Example #1
    def __init__(self):
        super(Network, self).__init__()

        self.moduleFeatures = Features()
        levels = [2, 3, 4, 5, 6]
        self.moduleMatching = dg.LayerList([Matching(intLevel) for intLevel in levels])
        self.moduleSubpixel = dg.LayerList([Subpixel(intLevel) for intLevel in levels])
        self.moduleRegularization = dg.LayerList([Regularization(intLevel) for intLevel in levels])
Example #2
    def __init__(self,
                 in_channels,
                 reduction_factor,
                 prenet_sizes,
                 layers,
                 kernel_size,
                 attention_dim,
                 position_encoding_weight=1.,
                 omega=1.,
                 has_bias=False,
                 bias_dim=0,
                 keep_prob=1.):
        super(Decoder, self).__init__()
        # prenet - note the difference between AffineBlock2 and AffineBlock1
        c_in = in_channels
        self.prenet = dg.LayerList()
        for i, c_out in enumerate(prenet_sizes):
            affine = AffineBlock2(c_in,
                                  c_out,
                                  has_bias,
                                  bias_dim,
                                  dropout=(i != 0),
                                  keep_prob=keep_prob)
            self.prenet.append(affine)
            c_in = c_out

        # causal convolutions + multihop attention
        decoder_dim = prenet_sizes[-1]
        self.causal_convs = dg.LayerList()
        self.attention_blocks = dg.LayerList()
        for i in range(layers):
            conv = ConvBlock(decoder_dim, kernel_size, True, has_bias,
                             bias_dim, keep_prob)
            attn = AttentionBlock(attention_dim, decoder_dim,
                                  position_encoding_weight, omega,
                                  reduction_factor, has_bias, bias_dim,
                                  keep_prob)
            self.causal_convs.append(conv)
            self.attention_blocks.append(attn)

        # output mel spectrogram
        output_dim = reduction_factor * in_channels  # r * mel_dim
        std = np.sqrt(1.0 / decoder_dim)
        initializer = I.NormalInitializer(loc=0., scale=std)
        out_affine = dg.Linear(decoder_dim, output_dim, param_attr=initializer)
        self.out_affine = weight_norm(out_affine, dim=-1)
        if has_bias:
            self.out_sp_affine = dg.Linear(bias_dim, output_dim)

        self.has_bias = has_bias
        self.kernel_size = kernel_size

        self.in_channels = in_channels
        self.decoder_dim = decoder_dim
        self.reduction_factor = reduction_factor
        self.out_channels = output_dim
Example #3
    def __init__(self, num_features, cond_dims, num_filters=128, kernel_size=3, weight_norm_type='',
        separate_projection=False, activation_norm_type='sync_batch', activation_norm_params=None, partial=False):
        super().__init__()
        if activation_norm_params is None:
            activation_norm_params = SimpleNamespace(affine=False)
        padding = kernel_size // 2
        self.separate_projection = separate_projection
        mlps = []
        gammas = []
        betas = []

        # Make cond_dims a list.
        if not isinstance(cond_dims, list):
            cond_dims = [cond_dims]
        
        # Make num_filters a list
        if not isinstance(num_filters, list):
            num_filters = [num_filters] * len(cond_dims)
        else:
            assert len(num_filters) >= len(cond_dims)
        
        # Make partial a list.
        if not isinstance(partial, list):
            partial = [partial] * len(cond_dims)
        else:
            assert len(partial) >= len(cond_dims)
        
        for i, cond_dim in enumerate(cond_dims):
            mlp = []
            conv_block = PartialConv2dBlock if partial[i] else Conv2dBlock
            sequential = PartialSequential if partial[i] else dg.Sequential

            if num_filters[i] > 0:
                mlp += [(str(i), conv_block(cond_dim, num_filters[i], kernel_size, padding=padding,
                                   weight_norm_type=weight_norm_type, nonlinearity='relu'))]
            mlp_ch = cond_dim if num_filters[i] == 0 else num_filters[i]

            if self.separate_projection:
                if partial[i]:
                    raise NotImplementedError("Separate projection not yet implemented for partial conv")
                mlps.append(dg.Sequential(*mlp))
                gammas.append((str(i), conv_block(mlp_ch, num_features, kernel_size, padding=padding, weight_norm_type=weight_norm_type)))
                betas.append((str(i), conv_block(mlp_ch, num_features, kernel_size, padding=padding, weight_norm_type=weight_norm_type)))
            else:
                mlp += [(str(i), conv_block(mlp_ch, num_features * 2, kernel_size, padding=padding, weight_norm_type=weight_norm_type))]
                mlps.append(sequential(*mlp))
        
        self.mlps = dg.LayerList(mlps)
        self.gammas = dg.LayerList(gammas)
        self.betas = dg.LayerList(betas)
        
        self.norm = get_activation_norm_layer(num_features, activation_norm_type, 2, **vars(activation_norm_params))

        self.conditional = True
Example #4
 def __init__(self, cfg, name=None):
     super(ErnieEncoderStack, self).__init__()
     n_layers = cfg['num_hidden_layers']
     self.block = D.LayerList([
         ErnieBlock(cfg, append_name(name, 'layer_%d' % i))
         for i in range(n_layers)
     ])
Example #5
 def __init__(self, num_features, cond_dims, num_filters=0, kernel_size=3,
     weight_norm_type='', activation_norm_type='sync_batch', is_hyper=True):
     super().__init__()
     padding = kernel_size // 2
     mlps = []
     if not isinstance(cond_dims, list):
         cond_dims = [cond_dims]
     
     for i, cond_dim in enumerate(cond_dims):
         mlp = []
         if not is_hyper or (i != 0):
             if num_filters > 0:
                 mlp += [(str(i), Conv2dBlock(cond_dim, num_filters, kernel_size, padding=padding,
                                              weight_norm_type=weight_norm_type, nonlinearity='relu'))]
             mlp_ch = cond_dim if num_filters == 0 else num_filters
             mlp += [(str(len(mlp)), Conv2dBlock(mlp_ch, num_features * 2, kernel_size, 
                                                 padding=padding, weight_norm_type=weight_norm_type))]
             mlp = dg.Sequential(*mlp)
         else:
             if num_filters > 0:
                 raise ValueError('Multi hyper layer not supported yet.')
             mlp = HyperConv2D(padding=padding)
         mlps.append(mlp)
     
     self.mlps = dg.LayerList(mlps)
     self.norm = get_activation_norm_layer(num_features, activation_norm_type, 2, affine=False)
     self.conditional = True
Example #6
 def __init__(self, layers, in_channels, encoder_dim, kernel_size, 
              has_bias=False, bias_dim=0, keep_prob=1.):
     super(Encoder, self).__init__()
     self.pre_affine = AffineBlock1(in_channels, encoder_dim, has_bias, bias_dim)
     self.convs = dg.LayerList([
         ConvBlock(encoder_dim, kernel_size, False, has_bias, bias_dim, keep_prob) \
             for _ in range(layers)])
     self.post_affine = AffineBlock1(encoder_dim, in_channels, has_bias, bias_dim)
Example #7
 def __init__(self, encoder_layer, num_layers, norm=None):
     super(TransformerEncoder, self).__init__()
     # Reuse the given encoder_layer as the first layer and build fresh
     # copies of it (same config) for the remaining layers.
     self.layers = dg.LayerList([(encoder_layer if i == 0 else
                               type(encoder_layer)(**encoder_layer._config))
                              for i in range(num_layers)])
     self.num_layers = num_layers
     self.norm = norm
     self.nhead = encoder_layer.nhead
Example #8
File: model.py Project: leeacord/Contrib
 def __init__(self, scales, num_channels):
     super(ImagePyramide, self).__init__()
     self.downs = dygraph.LayerList()
     self.name_list = []
     for scale in scales:
         self.downs.add_sublayer(
             str(scale).replace('.', '-'),
             AntiAliasInterpolation2d(num_channels, scale))
         self.name_list.append(str(scale).replace('.', '-'))
Example #9
 def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False):
     super().__init__()
     # Same cloning pattern as the encoder: keep decoder_layer as layer 0
     # and rebuild identically configured copies for the rest.
     self.layers = dg.LayerList([(decoder_layer if i == 0 else
                               type(decoder_layer)(**decoder_layer._config))
                              for i in range(num_layers)])
     self.num_layers = num_layers
     self.norm = norm
     self.return_intermediate = return_intermediate
     self.nhead = decoder_layer.nhead
Example #10
 def __init__(self, scales=(), **kwargs):
     super(MultiScaleDiscriminator, self).__init__()
     self.scales = scales
     self.discs = dygraph.LayerList()
     self.nameList = []
     for scale in scales:
         self.discs.add_sublayer(
             str(scale).replace('.', '-'), Discriminator(**kwargs))
         self.nameList.append(str(scale).replace('.', '-'))
Example #11
 def __init__(self, layers, in_channels, postnet_dim, kernel_size, out_channels, upsample_factor, has_bias=False, bias_dim=0, keep_prob=1.):
     super(PostNet, self).__init__()
     self.pre_affine = AffineBlock1(in_channels, postnet_dim, has_bias, bias_dim)
     self.convs = dg.LayerList([
         ConvBlock(postnet_dim, kernel_size, False, has_bias, bias_dim, keep_prob) for _ in range(layers)
     ])
     std = np.sqrt(1.0 / postnet_dim)
     post_affine = dg.Linear(postnet_dim, out_channels, param_attr=I.Normal(scale=std))
     self.post_affine = weight_norm(post_affine, dim=-1)
     self.upsample_factor = upsample_factor
Example #12
    def __init__(self,
                 n_class=1000,
                 chn=96,
                 blocks_with_attention="B2",
                 resolution=256):
        super().__init__()

        def DBlock(in_channel,
                   out_channel,
                   downsample=True,
                   use_attention=False,
                   skip_proj=None):
            return ResBlock(in_channel,
                            out_channel,
                            conditional=False,
                            upsample=False,
                            downsample=downsample,
                            use_attention=use_attention,
                            skip_proj=skip_proj)

        self.chn = chn
        self.colors = 3
        self.resolution = resolution
        self.blocks_with_attention = set(blocks_with_attention.split(","))
        self.blocks_with_attention.discard('')

        dblock = []
        in_channels, out_channels = self.get_in_out_channels()

        self.sa_ids = [
            int(s.split('B')[-1]) for s in self.blocks_with_attention
        ]

        for i, (nc_in,
                nc_out) in enumerate(zip(in_channels[:-1], out_channels[:-1])):
            dblock.append(
                DBlock(nc_in,
                       nc_out,
                       downsample=True,
                       use_attention=(i + 1) in self.sa_ids,
                       skip_proj=nc_in == nc_out))
        dblock.append(
            DBlock(in_channels[-1],
                   out_channels[-1],
                   downsample=False,
                   use_attention=len(out_channels) in self.sa_ids,
                   skip_proj=in_channels[-1] == out_channels[-1]))
        self.blocks = dg.LayerList(dblock)

        self.final_fc = SpectralNorm(dg.Linear(16 * chn, 1))

        self.embed_y = dg.Embedding(size=[n_class, 16 * chn],
                                    is_sparse=False,
                                    param_attr=Uniform(-0.1, 0.1))
        self.embed_y = SpectralNorm(self.embed_y)
Example #13
    def __init__(self):
        super().__init__()
        self.stem = Stem()

        inception_a = []
        for i in range(4):
            inception_a.append(InceptionA(384))
        self.inception_a = dg.LayerList(inception_a)
        self.reduction_a = ReductionA(384)

        inception_b = []
        for i in range(7):
            inception_b.append(InceptionB(1024))
        self.inception_b = dg.LayerList(inception_b)
        self.reduction_b = ReductionB(1024)

        inception_c = []
        for i in range(3):
            inception_c.append(InceptionC(1536))
        self.inception_c = dg.LayerList(inception_c)

        self.pool = dg.Pool2D(pool_type='avg', global_pooling=True)
Example #14
File: bilstm.py Project: zw331/DDParser
    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0):
        super(BiLSTM, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout

        self.f_cells = dygraph.LayerList()
        self.b_cells = dygraph.LayerList()
        for _ in range(self.num_layers):
            self.f_cells.append(
                rnn.BasicLSTMUnit(
                    input_size=input_size,
                    hidden_size=hidden_size,
                    param_attr=initializer.Xavier(uniform=False),
                    bias_attr=initializer.ConstantInitializer(value=0.0)))
            self.b_cells.append(
                rnn.BasicLSTMUnit(
                    input_size=input_size,
                    hidden_size=hidden_size,
                    param_attr=initializer.Xavier(uniform=False),
                    bias_attr=initializer.ConstantInitializer(value=0.0)))
            input_size = hidden_size * 2
Example #15
File: util.py Project: leeacord/Contrib
    def __init__(self,
                 block_expansion,
                 in_features,
                 num_blocks=3,
                 max_features=256):
        super(Encoder, self).__init__()

        down_blocks = []
        for i in range(num_blocks):
            down_blocks.append(
                DownBlock2d(in_features if i == 0 else min(
                    max_features, block_expansion * (2**i)),
                            min(max_features, block_expansion * (2**(i + 1))),
                            kernel_size=3,
                            padding=1))
        self.down_blocks = dygraph.LayerList(down_blocks)
Example #16
    def __init__(self, num_discriminators=3, kernel_size=3, num_image_channels=3,
        num_filters=64, num_layers=4, max_num_filters=512, activation_norm_type='',
        weight_norm_type='', **kwargs):
        super().__init__()
        for key in kwargs:
            if key not in ('type', 'patch_wise'):
                warnings.warn("Discriminator argument {} is not used".format(key))

        discriminators = []
        for i in range(num_discriminators):
            net_discriminator = NLayerPatchDiscriminator(kernel_size, num_image_channels,
                num_filters, num_layers, max_num_filters, activation_norm_type, weight_norm_type)
            discriminators.append(net_discriminator)

        self.discriminators = dg.LayerList(discriminators)
        print("Done with the Multi-resolution patch discriminator initialization.")
Example #17
    def __init__(self,
                 code_dim=128,
                 n_class=1000,
                 chn=96,
                 blocks_with_attention="B4",
                 resolution=512):
        super().__init__()

        def GBlock(in_channel, out_channel, n_class, z_dim, use_attention):
            return ResBlock(in_channel,
                            out_channel,
                            n_class=n_class,
                            z_dim=z_dim,
                            use_attention=use_attention)

        self.embed_y = dg.Linear(n_class, 128, bias_attr=False)

        self.chn = chn
        self.resolution = resolution
        self.blocks_with_attention = set(blocks_with_attention.split(","))
        self.blocks_with_attention.discard('')

        gblock = []
        in_channels, out_channels = self.get_in_out_channels()
        self.num_split = len(in_channels) + 1

        z_dim = code_dim // self.num_split + 128
        self.noise_fc = SpectralNorm(
            dg.Linear(code_dim // self.num_split, 4 * 4 * in_channels[0]))

        self.sa_ids = [
            int(s.split('B')[-1]) for s in self.blocks_with_attention
        ]

        for i, (nc_in, nc_out) in enumerate(zip(in_channels, out_channels)):
            gblock.append(
                GBlock(nc_in,
                       nc_out,
                       n_class=n_class,
                       z_dim=z_dim,
                       use_attention=(i + 1) in self.sa_ids))
        self.blocks = dg.LayerList(gblock)

        self.output_layer_bn = BatchNorm(1 * chn, epsilon=1e-5)
        self.output_layer_conv = SpectralNorm(
            dg.Conv2D(1 * chn, 3, [3, 3], padding=1))
Example #18
    def __init__(self, cfg, net_G, net_D, opt_G, opt_D, sch_G, sch_D,
                 train_dataset, val_dataset):
        print("Setup trainer.")

        # Initialize models and data loaders.
        self.cfg = cfg
        self.net_G = net_G
        self.net_D = net_D
        self.opt_G = opt_G
        self.opt_D = opt_D
        self.sch_G = sch_G
        self.sch_D = sch_D
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset

        # Initialize logging attributes.
        self.current_iteration = 0
        self.current_epoch = 0
        self.start_iteration_time = None
        self.elapsed_iteration_time = 0
        self.time_iteration = -1
        self.time_epoch = -1

        self.sequence_length = 1
        self.sequence_length_max = 16

        # Initialize loss functions.
        self.criteria = dg.LayerList()
        # Mapping from loss names to loss weights.
        self.weights = dict()
        self.losses = dict(gen_update=dict(), dis_update=dict())
        self.gen_losses = self.losses['gen_update']
        self.dis_losses = self.losses['dis_update']
        self._init_loss(cfg)

        self.meters = {}

        self.is_inference = cfg.is_inference
        self.has_fg = getattr(cfg.data, 'has_foreground', False)

        self.temporal_network_initialized = False
        self.gt_flow = [None, None]

        self.sample_size = (getattr(cfg.trainer, 'num_videos_to_test', 16),
                            getattr(cfg.trainer, 'num_frames_per_video', 10))
Example #19
    def __init__(self, cfg, name=None):
        super(ErnieModelForPretraining, self).__init__(cfg, name=name)
        initializer = F.initializer.TruncatedNormal(scale=cfg['initializer_range'])
        d_model = cfg['hidden_size']
        d_vocab = cfg['vocab_size']

        self.pooler_heads = D.LayerList([NSPHead(cfg, name=name)])
        self.mlm = _build_linear(d_model, d_model, append_name(name, 'mask_lm_trans_fc'), initializer, act=cfg['hidden_act'])
        self.mlm_ln = _build_ln(d_model, name=append_name(name, 'mask_lm_trans'))
        self.mlm_bias = L.create_parameter(
                dtype='float32',
                shape=[d_vocab], 
                attr=F.ParamAttr(
                    name=append_name(name, 'mask_lm_out_fc.b_0'), 
                    initializer=F.initializer.Constant(value=0.0)
                    ),
                is_bias=True,
            )
Example #20
    def __init__(self, n_loops, n_layers, residual_channels, condition_dim,
                 filter_size):
        """ParallelWaveNet, an inverse autoregressive flow model, it contains several flows(WaveNets).

        Args:
            n_loops (List[int]): `n_loop` for each flow.
            n_layers (List[int]): `n_layer` for each flow.
            residual_channels (int): `residual_channels` for every flow.
            condition_dim (int): `condition_dim` for every flow.
            filter_size (int): `filter_size` for every flow.
        """
        super(ParallelWaveNet, self).__init__()
        self.flows = dg.LayerList()
        for n_loop, n_layer in zip(n_loops, n_layers):
            # the teacher's log_scale_min does not matter here; -100 is a dummy value
            self.flows.append(
                WaveNet(n_loop, n_layer, residual_channels, 3, condition_dim,
                        filter_size, "mog", -100.0))
Example #21
File: util.py Project: leeacord/Contrib
    def __init__(self,
                 block_expansion,
                 in_features,
                 num_blocks=3,
                 max_features=256):
        super(Decoder, self).__init__()

        up_blocks = []

     for i in reversed(range(num_blocks)):
            in_filters = (1 if i == num_blocks - 1 else 2) * min(
                max_features, block_expansion * (2**(i + 1)))
            out_filters = min(max_features, block_expansion * (2**i))
            up_blocks.append(
                UpBlock2d(in_filters, out_filters, kernel_size=3, padding=1))

        self.up_blocks = dygraph.LayerList(up_blocks)
        self.out_filters = block_expansion + in_features
Example #22
    def __init__(self,
            num_class,
            vocab_size,
            emb_dim=32,
            num_filters=10,
            fc_hid_dim=32,
            num_channels=1,
            win_size_list=None,
            is_sparse=True,
            use_cudnn=True,
            ):
        super(TextCNN, self).__init__()

        self.embedding = D.Embedding(
            size=[vocab_size, emb_dim],
            dtype='float32',
            is_sparse=is_sparse)

        logging.info("num_class    = {}".format(num_class))
        logging.info("vocab size   = {}".format(vocab_size))
        logging.info("emb_dim      = {}".format(emb_dim))
        logging.info("num filters  = {}".format(num_filters))
        logging.info("fc_hid_dim   = {}".format(fc_hid_dim))
        logging.info("num channels = {}".format(num_channels))
        logging.info("windows size = {}".format(win_size_list))
        logging.info("is sparse    = {}".format(is_sparse))
        logging.info("use cudnn    = {}".format(use_cudnn))

        win_size_list = [3] if win_size_list is None else win_size_list
        def gen_conv_pool(win_size):
            """生成指定窗口的卷积池化层
            """
            return ConvPool(
                    num_channels,
                    num_filters,
                    [win_size, emb_dim],
                    padding=[1, 0],
                    use_cudnn=use_cudnn,
                    )

        self.conv_pool_list = D.LayerList([gen_conv_pool(win_size) for win_size in win_size_list])

        self._hid_fc = D.Linear(input_dim=num_filters * len(win_size_list), output_dim=fc_hid_dim, act="tanh")
        self._output_fc = D.Linear(input_dim=fc_hid_dim, output_dim=num_class, act=None)
Example #23
File: net.py Project: sshuster/Parakeet
    def __init__(self, upscale_factors=[16, 16]):
        """UpsamplingNet.
        It consists of several layers of Conv2DTranspose. Each Conv2DTranspose layer upsamples the time dimension by its `stride` times. And each Conv2DTranspose's filter_size at frequency dimension is 3.

        Args:
            upscale_factors (list[int], optional): time upsampling factors for each Conv2DTranspose Layer. The `UpsampleNet` contains len(upscale_factor) Conv2DTranspose Layers. Each upscale_factor is used as the `stride` for the corresponding Conv2DTranspose. Defaults to [16, 16].
        Note:
            np.prod(upscale_factors) should equals the `hop_length` of the stft transformation used to extract spectrogram features from audios. For example, 16 * 16 = 256, then the spectram extracted using a stft transformation whose `hop_length` is 256. See `librosa.stft` for more details.
        """
        super(UpsampleNet, self).__init__()
        self.upscale_factors = list(upscale_factors)
        self.upsample_convs = dg.LayerList()
        for i, factor in enumerate(upscale_factors):
            self.upsample_convs.append(
                Conv2DTranspose(1,
                                1,
                                filter_size=(3, 2 * factor),
                                stride=(1, factor),
                                padding=(1, factor // 2)))
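As the docstring notes, the product of the upscale factors must match the STFT hop length. A quick sanity check, assuming the `hop_length` of 256 from the docstring's example:

    import numpy as np

    hop_length = 256            # STFT hop used when extracting the spectrogram
    upscale_factors = [16, 16]  # total time upsampling: 16 * 16 = 256
    assert np.prod(upscale_factors) == hop_length, \
        "UpsampleNet must upsample time by exactly hop_length"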
Example #24
    def __init__(self, n_loop, n_layer, residual_channels, condition_dim,
                 filter_size):
        """The residual network in wavenet. It consists of `n_layer` stacks, each of which consists of `n_loop` ResidualBlocks.

        Args:
            n_loop (int): number of ResidualBlocks in a stack.
            n_layer (int): number of stacks in the `ResidualNet`.
            residual_channels (int): channels of each `ResidualBlock`'s input.
            condition_dim (int): channels of the condition.
            filter_size (int): filter size of the internal Conv1DCell of each `ResidualBlock`.
        """
        super(ResidualNet, self).__init__()
        # double the dilation at each layer within a loop (n_loop layers)
        dilations = [2**i for i in range(n_loop)] * n_layer
        self.context_size = 1 + sum(dilations)
        self.residual_blocks = dg.LayerList([
            ResidualBlock(residual_channels, condition_dim, filter_size,
                          dilation) for dilation in dilations
        ])
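The receptive field follows directly from the dilation schedule: within each loop the dilation doubles `n_loop` times, and the schedule repeats `n_layer` times, so `context_size = 1 + n_layer * (2**n_loop - 1)`. A worked example with hypothetical sizes:

    # Illustrative values, not from the source.
    n_loop, n_layer = 10, 3
    dilations = [2 ** i for i in range(n_loop)] * n_layer
    # one loop sums to 2**10 - 1 = 1023; three loops sum to 3069
    context_size = 1 + sum(dilations)
    assert context_size == 1 + n_layer * (2 ** n_loop - 1)  # 3070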
Example #25
    def __init__(self,
            emb_dim,
            num_filters=10,
            num_channels=1,
            win_size_list=None,
            use_cudnn=True,
            ):
        super(TextCNNLayer, self).__init__()

        if win_size_list is None:
            win_size_list = [3]

        def gen_conv_pool(win_size):
            """生成指定窗口的卷积池化层
            """
            return ConvPoolLayer(
                    num_channels,
                    num_filters,
                    [win_size, emb_dim],
                    padding=[1, 0],
                    use_cudnn=use_cudnn,
                    )

        self.conv_pool_list = D.LayerList([gen_conv_pool(win_size) for win_size in win_size_list])
Example #26
    def __init__(self,
                 num_channels=3,
                 block_expansion=64,
                 num_blocks=4,
                 max_features=512,
                 sn=False,
                 use_kp=False,
                 num_kp=10,
                 kp_variance=0.01,
                 **kwargs):
        super(Discriminator, self).__init__()

        down_blocks = []
        for i in range(num_blocks):
            down_blocks.append(
                DownBlock2d(num_channels + num_kp * use_kp if i == 0 else min(
                    max_features, block_expansion * (2**i)),
                            min(max_features, block_expansion * (2**(i + 1))),
                            norm=(i != 0),
                            kernel_size=4,
                            pool=(i != num_blocks - 1),
                            sn=sn))

        self.down_blocks = dygraph.LayerList(down_blocks)
        self.conv = dygraph.Conv2D(
            self.down_blocks[len(self.down_blocks) -
                             1].conv.parameters()[0].shape[0],
            1,
            filter_size=1)
        if sn:
            self.sn = dygraph.SpectralNorm(self.conv.parameters()[0].shape,
                                           dim=0)
        else:
            self.sn = None
        self.use_kp = use_kp
        self.kp_variance = kp_variance
Example #27
 def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
     super().__init__()
     self.num_layers = num_layers
     h = [hidden_dim] * (num_layers - 1)
     self.layers = dg.LayerList(
         dg.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
Example #28
    def __init__(self,
                 n_speakers,
                 speaker_dim,
                 in_channels,
                 linear_dim,
                 convolutions=(ConvSpec(256, 5, 1), ) * 4,
                 time_upsampling=1,
                 dropout=0.0):
        """Vocoder that transforms mel spectrogram (or ecoder hidden states) to waveform.

        Args:
            n_speakers (int): number of speakers.
            speaker_dim (int): speaker embedding size.
            in_channels (int): channels of the input.
            linear_dim (int): channels of the linear spectrogram.
            convolutions (Iterable[ConvSpec], optional): specifications of the internal convolutional layers. ConvSpec is a namedtuple of (output_channels, filter_size, dilation). Defaults to (ConvSpec(256, 5, 1), )*4.
            time_upsampling (int, optional): time upsampling factor of the converter; possible options are {1, 2, 4}. Note that this should equal the downsampling factor of the mel spectrogram. Defaults to 1.
            dropout (float, optional): dropout probability. Defaults to 0.0.
        """
        super(Converter, self).__init__()

        self.n_speakers = n_speakers
        self.speaker_dim = speaker_dim
        self.in_channels = in_channels
        self.linear_dim = linear_dim
        # CAUTION: this should equal the mel downsampling factor
        self.time_upsampling = time_upsampling
        self.dropout = dropout

        target_channels = convolutions[0].out_channels

        # conv proj to target channels
        self.first_conv_proj = Conv1D(
            in_channels,
            target_channels,
            1,
            param_attr=I.Normal(scale=np.sqrt(1 / in_channels)))

        # Idea from nyanko
        if time_upsampling == 4:
            self.upsampling_convolutions = dg.LayerList(
                upsampling_4x_blocks(n_speakers, speaker_dim, target_channels,
                                     dropout))
        elif time_upsampling == 2:
            self.upsampling_convolutions = dg.LayerList(
                upsampling_2x_blocks(n_speakers, speaker_dim, target_channels,
                                     dropout))
        elif time_upsampling == 1:
            self.upsampling_convolutions = dg.LayerList(
                upsampling_1x_blocks(n_speakers, speaker_dim, target_channels,
                                     dropout))
        else:
            raise ValueError(
                "Upsampling factors other than {1, 2, 4} are Not supported.")

        # post conv layers
        std_mul = 4.0
        in_channels = target_channels
        self.convolutions = dg.LayerList()
        for (out_channels, filter_size, dilation) in convolutions:
            if in_channels != out_channels:
                std = np.sqrt(std_mul / in_channels)
                # CAUTION: relu
                self.convolutions.append(
                    Conv1D(in_channels,
                           out_channels,
                           1,
                           act="relu",
                           param_attr=I.Normal(scale=std)))
                in_channels = out_channels
                std_mul = 2.0
            self.convolutions.append(
                Conv1DGLU(n_speakers,
                          speaker_dim,
                          in_channels,
                          out_channels,
                          filter_size,
                          dilation=dilation,
                          std_mul=std_mul,
                          dropout=dropout))
            in_channels = out_channels
            std_mul = 4.0

        # final conv proj, channel transformed to linear dim
        std = np.sqrt(std_mul * (1 - dropout) / in_channels)
        # CAUTION: sigmoid
        self.last_conv_proj = Conv1D(in_channels,
                                     linear_dim,
                                     1,
                                     act="sigmoid",
                                     param_attr=I.Normal(scale=std))
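Because the converter only implements 1x, 2x and 4x upsampling, `time_upsampling` has to mirror however much the mel spectrogram was downsampled during preprocessing. A hedged instantiation sketch (all sizes are hypothetical):

    mel_downsample_factor = 4   # assumed preprocessing setting
    assert mel_downsample_factor in (1, 2, 4)
    converter = Converter(n_speakers=1,
                          speaker_dim=16,
                          in_channels=80,    # mel bands
                          linear_dim=513,    # linear spectrogram bins
                          time_upsampling=mel_downsample_factor)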
Example #29
    def __init__(self,
                 n_speakers,
                 speaker_dim,
                 embed_dim,
                 mel_dim,
                 r=1,
                 max_positions=512,
                 preattention=(ConvSpec(128, 5, 1), ) * 4,
                 convolutions=(ConvSpec(128, 5, 1), ) * 4,
                 attention=True,
                 dropout=0.0,
                 use_memory_mask=False,
                 force_monotonic_attention=False,
                 query_position_rate=1.0,
                 key_position_rate=1.0,
                 window_range=WindowRange(-1, 3),
                 key_projection=True,
                 value_projection=True):
        """Decoder of the Deep Voice 3 model.

        Args:
            n_speakers (int): number of speakers.
            speaker_dim (int): speaker embedding size.
            embed_dim (int): text embedding size.
            mel_dim (int): channels of the mel input (mel bands).
            r (int, optional): number of frames generated per decoder step. Defaults to 1.
            max_positions (int, optional): max position for text and decoder steps. Defaults to 512.
            preattention (Iterable[ConvSpec], optional): specification of the prenet convolutional layers (before attention). ConvSpec is a namedtuple of output_channels, filter_size and dilation. Defaults to (ConvSpec(128, 5, 1), )*4.
            convolutions (Iterable[ConvSpec], optional): specification of causal convolutional layers inside the decoder. ConvSpec is a namedtuple of output_channels, filter_size and dilation. Defaults to (ConvSpec(128, 5, 1), )*4.
            attention (bool or List[bool], optional): whether to use attention, it should have the same length with `convolutions` if it is a list of bool, indicating whether to have an Attention layer coupled with the corresponding convolutional layer. If it is a bool, it is repeated len(convolutions) times internally. Defaults to True.
            dropout (float, optional): dropout probability. Defaults to 0.0.
            use_memory_mask (bool, optional): whether to use memory mask at the Attention layer. It should have the same length with `attention` if it is a list of bool, indicating whether to use memory mask at the corresponding Attention layer. If it is a bool, it is repeated len(attention) times internally. Defaults to False.
            force_monotonic_attention (bool, optional): whether to use monotonic_attention at the Attention layer when inferencing. It should have the same length with `attention` if it is a list of bool, indicating whether to use monotonic_attention at the corresponding Attention layer. If it is a bool, it is repeated len(attention) times internally. Defaults to False.
            query_position_rate (float, optional): position_rate of the PositionEmbedding for query. Defaults to 1.0.
            key_position_rate (float, optional): position_rate of the PositionEmbedding for key. Defaults to 1.0.
            window_range (WindowRange, optional): window range of monotonic attention. Defaults to WindowRange(-1, 3).
            key_projection (bool, optional): `key_projection` of Attention layers. Defaults to True.
            value_projection (bool, optional): `value_projection` of Attention layers. Defaults to True.
        """
        super(Decoder, self).__init__()

        self.dropout = dropout
        self.mel_dim = mel_dim
        self.r = r
        self.query_position_rate = query_position_rate
        self.key_position_rate = key_position_rate
        self.window_range = window_range
        self.n_speakers = n_speakers

        conv_channels = convolutions[0].out_channels
        # only when the padding idx is 0 can we easily handle it
        self.embed_keys_positions = PositionEmbedding(max_positions, embed_dim)
        self.embed_query_positions = PositionEmbedding(max_positions,
                                                       conv_channels)

        if n_speakers > 1:
            std = np.sqrt((1 - dropout) / speaker_dim)
            self.speaker_proj1 = Linear(speaker_dim,
                                        1,
                                        act="sigmoid",
                                        param_attr=I.Normal(scale=std))
            self.speaker_proj2 = Linear(speaker_dim,
                                        1,
                                        act="sigmoid",
                                        param_attr=I.Normal(scale=std))

        # prenet
        self.prenet = dg.LayerList()
        in_channels = mel_dim * r  # multiframe
        std_mul = 1.0
        for (out_channels, filter_size, dilation) in preattention:
            if in_channels != out_channels:
                # conv1d & relu
                std = np.sqrt(std_mul / in_channels)
                self.prenet.append(
                    Conv1D(in_channels,
                           out_channels,
                           1,
                           act="relu",
                           param_attr=I.Normal(scale=std)))
                in_channels = out_channels
                std_mul = 2.0
            self.prenet.append(
                Conv1DGLU(n_speakers,
                          speaker_dim,
                          in_channels,
                          out_channels,
                          filter_size,
                          dilation,
                          std_mul,
                          dropout,
                          causal=True,
                          residual=True))
            in_channels = out_channels
            std_mul = 4.0

        # attention
        self.use_memory_mask = use_memory_mask
        if isinstance(attention, bool):
            self.attention = [attention] * len(convolutions)
        else:
            self.attention = attention

        if isinstance(force_monotonic_attention, bool):
            self.force_monotonic_attention = [force_monotonic_attention
                                              ] * len(convolutions)
        else:
            self.force_monotonic_attention = force_monotonic_attention

        for x, y in zip(self.force_monotonic_attention, self.attention):
            if x is True and y is False:
                raise ValueError("When not using attention, there is no "
                                 "monotonic attention at all")

        # causal convolution & attention
        self.conv_attn = []
        for use_attention, (out_channels, filter_size,
                            dilation) in zip(self.attention, convolutions):
            assert (
                in_channels == out_channels
            ), "the stack of convolution & attention does not change channels"
            conv_layer = Conv1DGLU(n_speakers,
                                   speaker_dim,
                                   in_channels,
                                   out_channels,
                                   filter_size,
                                   dilation,
                                   std_mul,
                                   dropout,
                                   causal=True,
                                   residual=False)
            attn_layer = Attention(
                out_channels,
                embed_dim,
                dropout,
                window_range,
                key_projection=key_projection,
                value_projection=value_projection) if use_attention else None
            in_channels = out_channels
            std_mul = 4.0
            self.conv_attn.append((conv_layer, attn_layer))
        for i, (conv_layer, attn_layer) in enumerate(self.conv_attn):
            self.add_sublayer("conv_{}".format(i), conv_layer)
            if attn_layer is not None:
                self.add_sublayer("attn_{}".format(i), attn_layer)

        # 1 * 1 conv to transform channels
        std = np.sqrt(std_mul * (1 - dropout) / in_channels)
        self.last_conv = Conv1D(in_channels,
                                mel_dim * r,
                                1,
                                param_attr=I.Normal(scale=std))

        # mel (before sigmoid) to done hat
        std = np.sqrt(1 / in_channels)
        self.fc = Conv1D(mel_dim * r, 1, 1, param_attr=I.Normal(scale=std))

        # decoding configs
        self.max_decoder_steps = 200
        self.min_decoder_steps = 10

        assert convolutions[-1].out_channels % r == 0, \
                "decoder_state dim must be divisible by r"
        self.state_dim = convolutions[-1].out_channels // self.r
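The bool-or-list arguments let attention be configured per layer, under the constraint checked above: a layer cannot force monotonic attention unless it has an attention block at all. An illustrative per-layer setup (all values are assumptions, not from the source):

    # Four conv layers; attention only on the first and last, and
    # monotonic attention forced only on the first.
    decoder = Decoder(n_speakers=1,
                      speaker_dim=16,
                      embed_dim=256,
                      mel_dim=80,
                      r=4,   # default convolutions have 128 out_channels, 128 % 4 == 0
                      attention=[True, False, False, True],
                      force_monotonic_attention=[True, False, False, False])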
Example #30
    def __init__(self,
                 n_vocab,
                 embed_dim,
                 n_speakers,
                 speaker_dim,
                 padding_idx=None,
                 embedding_weight_std=0.1,
                 convolutions=(ConvSpec(64, 5, 1), ) * 7,
                 dropout=0.):
        """Encoder of Deep Voice 3.

        Args:
            n_vocab (int): vocabulary size of the text embedding.
            embed_dim (int): embedding size of the text embedding.
            n_speakers (int): number of speakers.
            speaker_dim (int): speaker embedding size.
            padding_idx (int, optional): padding index of text embedding. Defaults to None.
            embedding_weight_std (float, optional): standard deviation of the embedding weights when initialized. Defaults to 0.1.
            convolutions (Iterable[ConvSpec], optional): specifications of the convolutional layers. ConvSpec is a namedtuple of output channels, filter_size and dilation. Defaults to (ConvSpec(64, 5, 1), )*7.
            dropout (float, optional): dropout probability. Defaults to 0.0.
        """
        super(Encoder, self).__init__()
        self.embedding_weight_std = embedding_weight_std
        self.embed = dg.Embedding(
            (n_vocab, embed_dim),
            padding_idx=padding_idx,
            param_attr=I.Normal(scale=embedding_weight_std))

        self.dropout = dropout
        if n_speakers > 1:
            std = np.sqrt((1 - dropout) / speaker_dim)
            self.sp_proj1 = Linear(speaker_dim,
                                   embed_dim,
                                   act="softsign",
                                   param_attr=I.Normal(scale=std))
            self.sp_proj2 = Linear(speaker_dim,
                                   embed_dim,
                                   act="softsign",
                                   param_attr=I.Normal(scale=std))
        self.n_speakers = n_speakers

        self.convolutions = dg.LayerList()
        in_channels = embed_dim
        std_mul = 1.0
        for (out_channels, filter_size, dilation) in convolutions:
            # 1 * 1 convolution & relu
            if in_channels != out_channels:
                std = np.sqrt(std_mul / in_channels)
                self.convolutions.append(
                    Conv1D(in_channels,
                           out_channels,
                           1,
                           act="relu",
                           param_attr=I.Normal(scale=std)))
                in_channels = out_channels
                std_mul = 2.0

            self.convolutions.append(
                Conv1DGLU(n_speakers,
                          speaker_dim,
                          in_channels,
                          out_channels,
                          filter_size,
                          dilation,
                          std_mul,
                          dropout,
                          causal=False,
                          residual=True))
            in_channels = out_channels
            std_mul = 4.0

        std = np.sqrt(std_mul * (1 - dropout) / in_channels)
        self.convolutions.append(
            Conv1D(in_channels, embed_dim, 1, param_attr=I.Normal(scale=std)))
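`ConvSpec` itself, per the docstrings above, is just a namedtuple of (out_channels, filter_size, dilation). A minimal sketch of how it could be declared and used to customize the encoder stack (the declaration here is an assumption; the project defines it in its own modules):

    from collections import namedtuple

    ConvSpec = namedtuple("ConvSpec", ["out_channels", "filter_size", "dilation"])

    # Seven identical non-causal GLU conv layers, matching the default.
    encoder = Encoder(n_vocab=200,      # hypothetical vocabulary size
                      embed_dim=128,
                      n_speakers=1,
                      speaker_dim=16,
                      convolutions=(ConvSpec(64, 5, 1),) * 7)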