def __init__(self):
    """Create the feature extractor and one module of each kind per level.

    A Matching, a Subpixel and a Regularization layer are instantiated for
    every level in 2..6 and stored in three separate layer lists.
    """
    super(Network, self).__init__()

    self.moduleFeatures = Features()

    level_ids = (2, 3, 4, 5, 6)
    self.moduleMatching = dg.LayerList(
        [Matching(level) for level in level_ids])
    self.moduleSubpixel = dg.LayerList(
        [Subpixel(level) for level in level_ids])
    self.moduleRegularization = dg.LayerList(
        [Regularization(level) for level in level_ids])
def __init__(self,
             in_channels,
             reduction_factor,
             prenet_sizes,
             layers,
             kernel_size,
             attention_dim,
             position_encoding_weight=1.,
             omega=1.,
             has_bias=False,
             bias_dim=0,
             keep_prob=1.):
    """Build the prenet, the conv/attention stack and the output projection.

    Args:
        in_channels: input feature size; also used to size the output
            (``reduction_factor * in_channels``).
        reduction_factor: multiplier applied to ``in_channels`` for the
            output size; also forwarded to every AttentionBlock.
        prenet_sizes: output size of each prenet AffineBlock2; the last
            entry is the width of the conv/attention stack.
        layers: number of (ConvBlock, AttentionBlock) pairs.
        kernel_size: kernel size forwarded to every ConvBlock.
        attention_dim: forwarded to every AttentionBlock.
        position_encoding_weight: forwarded to every AttentionBlock.
        omega: forwarded to every AttentionBlock.
        has_bias: forwarded to the sub-blocks; when true an extra
            ``out_sp_affine`` Linear(bias_dim, output_dim) is created.
        bias_dim: forwarded to the sub-blocks.
        keep_prob: forwarded to the sub-blocks.
    """
    super(Decoder, self).__init__()

    # Prenet. Note that AffineBlock2 (not AffineBlock1) is used here, and
    # that dropout is switched on for every prenet layer except the first.
    prenet_layers = []
    prev_dim = in_channels
    for idx, size in enumerate(prenet_sizes):
        prenet_layers.append(
            AffineBlock2(prev_dim,
                         size,
                         has_bias,
                         bias_dim,
                         dropout=(idx != 0),
                         keep_prob=keep_prob))
        prev_dim = size
    self.prenet = dg.LayerList(prenet_layers)

    # Causal convolutions interleaved with multi-hop attention. The two
    # kinds of layer are created pairwise, one pair per iteration.
    decoder_dim = prenet_sizes[-1]
    self.causal_convs = dg.LayerList()
    self.attention_blocks = dg.LayerList()
    for _ in range(layers):
        causal_conv = ConvBlock(decoder_dim, kernel_size, True, has_bias,
                                bias_dim, keep_prob)
        attention = AttentionBlock(attention_dim, decoder_dim,
                                   position_encoding_weight, omega,
                                   reduction_factor, has_bias, bias_dim,
                                   keep_prob)
        self.causal_convs.append(causal_conv)
        self.attention_blocks.append(attention)

    # Output projection (original comment: r * mel_dim).
    output_dim = reduction_factor * in_channels
    init_std = np.sqrt(1.0 / decoder_dim)
    projection = dg.Linear(
        decoder_dim,
        output_dim,
        param_attr=I.NormalInitializer(loc=0., scale=init_std))
    self.out_affine = weight_norm(projection, dim=-1)
    if has_bias:
        self.out_sp_affine = dg.Linear(bias_dim, output_dim)

    # Plain configuration values kept for later use.
    self.has_bias = has_bias
    self.kernel_size = kernel_size
    self.in_channels = in_channels
    self.decoder_dim = decoder_dim
    self.reduction_factor = reduction_factor
    self.out_channels = output_dim
def __init__(self,
             num_features,
             cond_dims,
             num_filters=128,
             kernel_size=3,
             weight_norm_type='',
             separate_projection=False,
             activation_norm_type='sync_batch',
             activation_norm_params=None,
             partial=False):
    """Build one conditioning branch per entry of ``cond_dims``.

    Args:
        num_features: channel count passed to the activation norm layer;
            each projection produces ``num_features`` (separate gamma and
            beta) or ``num_features * 2`` (joint) channels.
        cond_dims: a single value or a list of input channel counts, one
            per conditioning input.
        num_filters: hidden channel count, scalar or list. A value of 0
            (or less) skips the hidden conv for that branch.
        kernel_size: kernel size of every conv; padding is
            ``kernel_size // 2``.
        weight_norm_type: forwarded to every conv block.
        separate_projection: when true, gamma and beta get their own conv
            blocks (stored in ``self.gammas`` / ``self.betas``) instead of
            one joint projection inside the branch.
        activation_norm_type: forwarded to ``get_activation_norm_layer``.
        activation_norm_params: namespace whose attributes are forwarded
            as keyword arguments to ``get_activation_norm_layer``;
            defaults to ``SimpleNamespace(affine=False)``.
        partial: bool or list of bools selecting the partial-conv variant
            of the blocks for each branch.

    Raises:
        NotImplementedError: if ``separate_projection`` is combined with a
            partial-conv branch.
        AssertionError: if ``num_filters`` or ``partial`` is a list shorter
            than ``cond_dims``.
    """
    super().__init__()
    if activation_norm_params is None:
        activation_norm_params = SimpleNamespace(affine=False)
    padding = kernel_size // 2
    self.separate_projection = separate_projection
    mlps = []
    gammas = []
    betas = []

    # Normalise the per-branch arguments to lists of equal length.
    if not isinstance(cond_dims, list):
        cond_dims = [cond_dims]
    if not isinstance(num_filters, list):
        num_filters = [num_filters] * len(cond_dims)
    else:
        assert len(num_filters) >= len(cond_dims)
    if not isinstance(partial, list):
        partial = [partial] * len(cond_dims)
    else:
        assert len(partial) >= len(cond_dims)

    for i, cond_dim in enumerate(cond_dims):
        mlp = []
        conv_block = PartialConv2dBlock if partial[i] else Conv2dBlock
        sequential = PartialSequential if partial[i] else dg.Sequential

        if num_filters[i] > 0:
            # In the joint-projection branch the projection below is named
            # str(i). The hidden conv previously used the same name, so
            # the second registration replaced the first inside the
            # Sequential and the hidden conv was silently lost. Give it a
            # distinct name there; the separate-projection branch has no
            # second entry, so it keeps its original name.
            if self.separate_projection:
                hidden_name = str(i)
            else:
                hidden_name = '{}_hidden'.format(i)
            mlp.append((hidden_name,
                        conv_block(cond_dim,
                                   num_filters[i],
                                   kernel_size,
                                   padding=padding,
                                   weight_norm_type=weight_norm_type,
                                   nonlinearity='relu')))
        mlp_ch = cond_dim if num_filters[i] == 0 else num_filters[i]

        if self.separate_projection:
            if partial[i]:
                raise NotImplementedError("Separate projection not yet implemented for partial conv")
            mlps.append(dg.Sequential(*mlp))
            # LayerList registers every element as a sublayer, so it must
            # receive the layers themselves rather than (name, layer)
            # tuples.
            gammas.append(
                conv_block(mlp_ch,
                           num_features,
                           kernel_size,
                           padding=padding,
                           weight_norm_type=weight_norm_type))
            betas.append(
                conv_block(mlp_ch,
                           num_features,
                           kernel_size,
                           padding=padding,
                           weight_norm_type=weight_norm_type))
        else:
            mlp.append((str(i),
                        conv_block(mlp_ch,
                                   num_features * 2,
                                   kernel_size,
                                   padding=padding,
                                   weight_norm_type=weight_norm_type)))
            mlps.append(sequential(*mlp))

    self.mlps = dg.LayerList(mlps)
    self.gammas = dg.LayerList(gammas)
    self.betas = dg.LayerList(betas)

    self.norm = get_activation_norm_layer(num_features,
                                          activation_norm_type, 2,
                                          **vars(activation_norm_params))
    self.conditional = True
def __init__(self, cfg, name=None):
    """Stack ``cfg['num_hidden_layers']`` ErnieBlock layers.

    Args:
        cfg: configuration mapping; ``'num_hidden_layers'`` is read here
            and the whole mapping is forwarded to every ErnieBlock.
        name: name prefix; block ``i`` is built with
            ``append_name(name, 'layer_%d' % i)``.
    """
    super(ErnieEncoderStack, self).__init__()

    blocks = []
    for layer_idx in range(cfg['num_hidden_layers']):
        block_name = append_name(name, 'layer_%d' % layer_idx)
        blocks.append(ErnieBlock(cfg, block_name))
    self.block = D.LayerList(blocks)
def __init__(self,
             num_features,
             cond_dims,
             num_filters=0,
             kernel_size=3,
             weight_norm_type='',
             activation_norm_type='sync_batch',
             is_hyper=True):
    """Build one conditioning branch per entry of ``cond_dims``.

    When ``is_hyper`` is true the first branch is a ``HyperConv2D``; every
    other branch is a ``dg.Sequential`` of an optional hidden conv followed
    by a projection to ``num_features * 2`` channels.

    Args:
        num_features: channel count passed to the activation norm layer.
        cond_dims: a single value or a list of input channel counts.
        num_filters: hidden channel count; 0 skips the hidden conv.
        kernel_size: kernel size of every conv; padding is
            ``kernel_size // 2``.
        weight_norm_type: forwarded to every Conv2dBlock.
        activation_norm_type: forwarded to ``get_activation_norm_layer``.
        is_hyper: whether branch 0 is a HyperConv2D.

    Raises:
        ValueError: if the hyper branch is requested with
            ``num_filters > 0``.
    """
    super().__init__()
    padding = kernel_size // 2
    if not isinstance(cond_dims, list):
        cond_dims = [cond_dims]

    mlps = []
    for i, cond_dim in enumerate(cond_dims):
        if is_hyper and i == 0:
            if num_filters > 0:
                raise ValueError('Multi hyper layer not supported yet.')
            mlps.append(HyperConv2D(padding=padding))
            continue

        mlp = []
        if num_filters > 0:
            # The projection below is named str(len(mlp)), i.e. '1' when a
            # hidden conv is present. For i == 1 the hidden conv used to be
            # named '1' as well, so Sequential kept only the projection.
            # Rename the hidden conv in that one case; every other name is
            # left as it was so existing parameter keys do not move.
            hidden_name = '0' if i == 1 else str(i)
            mlp.append((hidden_name,
                        Conv2dBlock(cond_dim,
                                    num_filters,
                                    kernel_size,
                                    padding=padding,
                                    weight_norm_type=weight_norm_type,
                                    nonlinearity='relu')))
        mlp_ch = cond_dim if num_filters == 0 else num_filters
        mlp.append((str(len(mlp)),
                    Conv2dBlock(mlp_ch,
                                num_features * 2,
                                kernel_size,
                                padding=padding,
                                weight_norm_type=weight_norm_type)))
        mlps.append(dg.Sequential(*mlp))

    self.mlps = dg.LayerList(mlps)
    self.norm = get_activation_norm_layer(num_features,
                                          activation_norm_type,
                                          2,
                                          affine=False)
    self.conditional = True
def __init__(self,
             layers,
             in_channels,
             encoder_dim,
             kernel_size,
             has_bias=False,
             bias_dim=0,
             keep_prob=1.):
    """Build pre-affine -> ``layers`` ConvBlocks -> post-affine.

    Args:
        layers: number of ConvBlock layers.
        in_channels: input size of ``pre_affine`` and output size of
            ``post_affine``.
        encoder_dim: width of the ConvBlock stack.
        kernel_size: kernel size forwarded to every ConvBlock.
        has_bias: forwarded to every sub-block.
        bias_dim: forwarded to every sub-block.
        keep_prob: forwarded to every ConvBlock.
    """
    super(Encoder, self).__init__()

    self.pre_affine = AffineBlock1(in_channels, encoder_dim, has_bias,
                                   bias_dim)

    conv_stack = []
    for _ in range(layers):
        # The third positional argument is False here (the Decoder in the
        # same style passes True for its causal convolutions).
        conv_stack.append(
            ConvBlock(encoder_dim, kernel_size, False, has_bias, bias_dim,
                      keep_prob))
    self.convs = dg.LayerList(conv_stack)

    self.post_affine = AffineBlock1(encoder_dim, in_channels, has_bias,
                                    bias_dim)
def __init__(self, encoder_layer, num_layers, norm=None):
    """Stack ``num_layers`` encoder layers.

    The given ``encoder_layer`` instance is used as the first layer; each
    further layer is a fresh instance of the same class constructed from
    ``encoder_layer._config``.

    Args:
        encoder_layer: prototype layer; must expose ``_config`` (keyword
            arguments for its constructor) and ``nhead``.
        num_layers: number of layers in the stack.
        norm: optional value stored as ``self.norm``.
    """
    super(TransformerEncoder, self).__init__()

    layer_cls = type(encoder_layer)
    stack = []
    for idx in range(num_layers):
        if idx == 0:
            stack.append(encoder_layer)
        else:
            stack.append(layer_cls(**encoder_layer._config))
    self.layers = dg.LayerList(stack)

    self.num_layers = num_layers
    self.norm = norm
    self.nhead = encoder_layer.nhead
def __init__(self, scales, num_channels):
    """Create one AntiAliasInterpolation2d sublayer per scale.

    Each sublayer is registered under ``str(scale)`` with every '.'
    replaced by '-'; the same keys are recorded, in order, in
    ``self.name_list``.

    Args:
        scales: iterable of scale values.
        num_channels: forwarded to every AntiAliasInterpolation2d.
    """
    super(ImagePyramide, self).__init__()

    self.downs = dygraph.LayerList()
    self.name_list = []
    for scale in scales:
        key = str(scale).replace('.', '-')
        self.downs.add_sublayer(
            key, AntiAliasInterpolation2d(num_channels, scale))
        self.name_list.append(key)
def __init__(self,
             decoder_layer,
             num_layers,
             norm=None,
             return_intermediate=False):
    """Stack ``num_layers`` decoder layers.

    The given ``decoder_layer`` instance is used as the first layer; each
    further layer is a fresh instance of the same class constructed from
    ``decoder_layer._config``.

    Args:
        decoder_layer: prototype layer; must expose ``_config`` (keyword
            arguments for its constructor) and ``nhead``.
        num_layers: number of layers in the stack.
        norm: optional value stored as ``self.norm``.
        return_intermediate: flag stored as ``self.return_intermediate``.
    """
    super().__init__()

    layer_cls = type(decoder_layer)
    stack = []
    for idx in range(num_layers):
        if idx == 0:
            stack.append(decoder_layer)
        else:
            stack.append(layer_cls(**decoder_layer._config))
    self.layers = dg.LayerList(stack)

    self.num_layers = num_layers
    self.norm = norm
    self.return_intermediate = return_intermediate
    self.nhead = decoder_layer.nhead
def __init__(self, scales=(), **kwargs):
    """Create one Discriminator per scale.

    Each discriminator is registered under ``str(scale)`` with every '.'
    replaced by '-'; the same keys are recorded, in order, in
    ``self.nameList``.

    Args:
        scales: iterable of scale values; also stored as ``self.scales``.
        **kwargs: forwarded unchanged to every Discriminator.
    """
    super(MultiScaleDiscriminator, self).__init__()

    self.scales = scales
    self.discs = dygraph.LayerList()
    self.nameList = []
    for scale in scales:
        key = str(scale).replace('.', '-')
        self.discs.add_sublayer(key, Discriminator(**kwargs))
        self.nameList.append(key)
def __init__(self,
             layers,
             in_channels,
             postnet_dim,
             kernel_size,
             out_channels,
             upsample_factor,
             has_bias=False,
             bias_dim=0,
             keep_prob=1.):
    """Build pre-affine -> ``layers`` ConvBlocks -> weight-normed Linear.

    Args:
        layers: number of ConvBlock layers.
        in_channels: input size of ``pre_affine``.
        postnet_dim: width of the ConvBlock stack.
        kernel_size: kernel size forwarded to every ConvBlock.
        out_channels: output size of ``post_affine``.
        upsample_factor: stored as ``self.upsample_factor``.
        has_bias: forwarded to the sub-blocks.
        bias_dim: forwarded to the sub-blocks.
        keep_prob: forwarded to every ConvBlock.
    """
    super(PostNet, self).__init__()

    self.pre_affine = AffineBlock1(in_channels, postnet_dim, has_bias,
                                   bias_dim)

    conv_stack = []
    for _ in range(layers):
        conv_stack.append(
            ConvBlock(postnet_dim, kernel_size, False, has_bias, bias_dim,
                      keep_prob))
    self.convs = dg.LayerList(conv_stack)

    # Output projection initialised with std = sqrt(1 / postnet_dim).
    init_std = np.sqrt(1.0 / postnet_dim)
    projection = dg.Linear(postnet_dim,
                           out_channels,
                           param_attr=I.Normal(scale=init_std))
    self.post_affine = weight_norm(projection, dim=-1)

    self.upsample_factor = upsample_factor
def __init__(self,
             n_class=1000,
             chn=96,
             blocks_with_attention="B2",
             resolution=256):
    """Build the stack of unconditional ResBlocks and the output heads.

    Args:
        n_class: number of rows of the class embedding.
        chn: base channel multiplier; the heads use ``16 * chn`` features.
        blocks_with_attention: comma-separated tags such as ``"B2"``; the
            integer after the last 'B' is the 1-based index of a block
            that gets ``use_attention=True``.
        resolution: stored as ``self.resolution`` before
            ``self.get_in_out_channels()`` is called.
    """
    super().__init__()

    def make_block(c_in, c_out, downsample, use_attention):
        # Unconditional, non-upsampling ResBlock; a skip projection is
        # requested exactly when input and output widths are equal.
        return ResBlock(c_in,
                        c_out,
                        conditional=False,
                        upsample=False,
                        downsample=downsample,
                        use_attention=use_attention,
                        skip_proj=c_in == c_out)

    self.chn = chn
    self.colors = 3
    self.resolution = resolution

    # Empty tags (e.g. from an empty string) are dropped.
    self.blocks_with_attention = set(blocks_with_attention.split(","))
    self.blocks_with_attention.discard('')

    in_channels, out_channels = self.get_in_out_channels()
    self.sa_ids = [
        int(tag.split('B')[-1]) for tag in self.blocks_with_attention
    ]

    # Every block but the last downsamples.
    stack = []
    leading_pairs = zip(in_channels[:-1], out_channels[:-1])
    for idx, (c_in, c_out) in enumerate(leading_pairs):
        stack.append(
            make_block(c_in,
                       c_out,
                       downsample=True,
                       use_attention=(idx + 1) in self.sa_ids))
    stack.append(
        make_block(in_channels[-1],
                   out_channels[-1],
                   downsample=False,
                   use_attention=len(out_channels) in self.sa_ids))
    self.blocks = dg.LayerList(stack)

    self.final_fc = SpectralNorm(dg.Linear(16 * chn, 1))

    self.embed_y = dg.Embedding(size=[n_class, 16 * chn],
                                is_sparse=False,
                                param_attr=Uniform(-0.1, 0.1))
    self.embed_y = SpectralNorm(self.embed_y)
def __init__(self):
    """Assemble the stem, three Inception stages and a global average pool.

    Stage sizes: 4 x InceptionA(384), 7 x InceptionB(1024) and
    3 x InceptionC(1536), with ReductionA/ReductionB between them.
    """
    super().__init__()

    self.stem = Stem()

    self.inception_a = dg.LayerList([InceptionA(384) for _ in range(4)])
    self.reduction_a = ReductionA(384)

    self.inception_b = dg.LayerList([InceptionB(1024) for _ in range(7)])
    self.reduction_b = ReductionB(1024)

    self.inception_c = dg.LayerList([InceptionC(1536) for _ in range(3)])

    self.pool = dg.Pool2D(pool_type='avg', global_pooling=True)
def __init__(self, input_size, hidden_size, num_layers=1, dropout=0):
    """Create ``num_layers`` pairs of forward/backward LSTM cells.

    Args:
        input_size: input width of the first layer; every later layer
            takes ``hidden_size * 2`` inputs.
        hidden_size: hidden width of every cell.
        num_layers: number of stacked layers.
        dropout: stored as ``self.dropout``.
    """
    super(BiLSTM, self).__init__()

    self.input_size = input_size
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.dropout = dropout

    def new_cell(in_dim):
        # Xavier-normal weights, zero bias.
        return rnn.BasicLSTMUnit(
            input_size=in_dim,
            hidden_size=hidden_size,
            param_attr=initializer.Xavier(uniform=False),
            bias_attr=initializer.ConstantInitializer(value=0.0))

    self.f_cells = dygraph.LayerList()
    self.b_cells = dygraph.LayerList()
    layer_in = input_size
    for _ in range(self.num_layers):
        self.f_cells.append(new_cell(layer_in))
        self.b_cells.append(new_cell(layer_in))
        # Later layers take twice the hidden width as input.
        layer_in = hidden_size * 2
def __init__(self,
             block_expansion,
             in_features,
             num_blocks=3,
             max_features=256):
    """Build ``num_blocks`` DownBlock2d layers with doubling widths.

    Block ``i`` maps ``min(max_features, block_expansion * 2**i)`` channels
    (``in_features`` for the first block) to
    ``min(max_features, block_expansion * 2**(i + 1))`` channels.
    """
    super(Encoder, self).__init__()

    def width(level):
        # Channel count at a given level, capped at max_features.
        return min(max_features, block_expansion * (2**level))

    blocks = []
    for idx in range(num_blocks):
        c_in = in_features if idx == 0 else width(idx)
        blocks.append(
            DownBlock2d(c_in, width(idx + 1), kernel_size=3, padding=1))
    self.down_blocks = dygraph.LayerList(blocks)
def __init__(self,
             num_discriminators=3,
             kernel_size=3,
             num_image_channels=3,
             num_filters=64,
             num_layers=4,
             max_num_filters=512,
             activation_norm_type='',
             weight_norm_type='',
             **kwargs):
    """Create ``num_discriminators`` identically configured patch nets.

    Args:
        num_discriminators: number of NLayerPatchDiscriminator instances.
        kernel_size, num_image_channels, num_filters, num_layers,
        max_num_filters, activation_norm_type, weight_norm_type:
            forwarded positionally, in this order, to every
            NLayerPatchDiscriminator.
        **kwargs: ignored; a warning is issued for every key other than
            'type' and 'patch_wise'.
    """
    super().__init__()

    for key in kwargs:
        if key not in ('type', 'patch_wise'):
            warnings.warn("Discriminator argument {} is not used".format(key))

    self.discriminators = dg.LayerList([
        NLayerPatchDiscriminator(kernel_size, num_image_channels,
                                 num_filters, num_layers, max_num_filters,
                                 activation_norm_type, weight_norm_type)
        for _ in range(num_discriminators)
    ])
    print("Done with the Multi-resolution patch discriminator initialization.")
def __init__(self,
             code_dim=128,
             n_class=1000,
             chn=96,
             blocks_with_attention="B4",
             resolution=512):
    """Build the class embedding, noise projection, ResBlocks and output.

    Args:
        code_dim: size of the latent code; it is divided into
            ``self.num_split`` chunks (integer division).
        n_class: number of classes (input size of ``embed_y``).
        chn: base channel multiplier.
        blocks_with_attention: comma-separated tags such as ``"B4"``; the
            integer after the last 'B' is the 1-based index of a block
            that gets ``use_attention=True``.
        resolution: stored as ``self.resolution`` before
            ``self.get_in_out_channels()`` is called.
    """
    super().__init__()

    self.embed_y = dg.Linear(n_class, 128, bias_attr=False)

    self.chn = chn
    self.resolution = resolution

    # Empty tags (e.g. from an empty string) are dropped.
    self.blocks_with_attention = set(blocks_with_attention.split(","))
    self.blocks_with_attention.discard('')

    in_channels, out_channels = self.get_in_out_channels()

    # One latent chunk per block plus one for the initial projection.
    self.num_split = len(in_channels) + 1
    chunk_dim = code_dim // self.num_split
    # Per-block conditioning size: latent chunk plus the 128-wide class
    # embedding produced by embed_y.
    z_dim = chunk_dim + 128

    self.noise_fc = SpectralNorm(
        dg.Linear(chunk_dim, 4 * 4 * in_channels[0]))

    self.sa_ids = [
        int(tag.split('B')[-1]) for tag in self.blocks_with_attention
    ]

    stack = []
    for idx, (c_in, c_out) in enumerate(zip(in_channels, out_channels)):
        stack.append(
            ResBlock(c_in,
                     c_out,
                     n_class=n_class,
                     z_dim=z_dim,
                     use_attention=(idx + 1) in self.sa_ids))
    self.blocks = dg.LayerList(stack)

    self.output_layer_bn = BatchNorm(1 * chn, epsilon=1e-5)
    self.output_layer_conv = SpectralNorm(
        dg.Conv2D(1 * chn, 3, [3, 3], padding=1))
def __init__(self, cfg, net_G, net_D, opt_G, opt_D, sch_G, sch_D,
             train_dataset, val_dataset):
    """Store the training components and initialise bookkeeping state.

    Args:
        cfg: configuration object; ``cfg.is_inference``, ``cfg.data`` and
            ``cfg.trainer`` are read here and ``cfg`` is passed to
            ``self._init_loss``.
        net_G, net_D: generator and discriminator networks.
        opt_G, opt_D: their optimizers.
        sch_G, sch_D: their schedulers.
        train_dataset, val_dataset: datasets stored for later use.
    """
    print("Setup trainer.")

    # Models, optimizers, schedulers and data.
    self.cfg = cfg
    self.net_G, self.net_D = net_G, net_D
    self.opt_G, self.opt_D = opt_G, opt_D
    self.sch_G, self.sch_D = sch_G, sch_D
    self.train_dataset = train_dataset
    self.val_dataset = val_dataset

    # Progress and timing counters.
    self.current_iteration = 0
    self.current_epoch = 0
    self.start_iteration_time = None
    self.elapsed_iteration_time = 0
    self.time_iteration = -1
    self.time_epoch = -1

    self.sequence_length = 1
    self.sequence_length_max = 16

    # Loss containers; these exist before _init_loss(cfg) is called.
    self.criteria = dg.LayerList()
    # Mapping from loss names to loss weights.
    self.weights = {}
    self.losses = {'gen_update': {}, 'dis_update': {}}
    self.gen_losses = self.losses['gen_update']
    self.dis_losses = self.losses['dis_update']
    self._init_loss(cfg)
    self.meters = {}

    self.is_inference = cfg.is_inference
    self.has_fg = getattr(cfg.data, 'has_foreground', False)

    self.temporal_network_initialized = False
    self.gt_flow = [None, None]

    # (number of videos, frames per video), with defaults 16 and 10.
    videos_to_test = getattr(cfg.trainer, 'num_videos_to_test', 16)
    frames_per_video = getattr(cfg.trainer, 'num_frames_per_video', 10)
    self.sample_size = (videos_to_test, frames_per_video)
def __init__(self, cfg, name=None):
    """Add the pretraining heads on top of the base model.

    Creates a single NSPHead (in ``self.pooler_heads``), the masked-LM
    transform Linear and layer norm, and the masked-LM output bias of
    shape ``[vocab_size]``.

    Args:
        cfg: configuration mapping; ``'initializer_range'``,
            ``'hidden_size'``, ``'vocab_size'`` and ``'hidden_act'`` are
            read here.
        name: name prefix combined with each parameter name through
            ``append_name``.
    """
    super(ErnieModelForPretraining, self).__init__(cfg, name=name)

    weight_init = F.initializer.TruncatedNormal(
        scale=cfg['initializer_range'])
    hidden_size = cfg['hidden_size']
    vocab_size = cfg['vocab_size']

    self.pooler_heads = D.LayerList([NSPHead(cfg, name=name)])

    self.mlm = _build_linear(hidden_size,
                             hidden_size,
                             append_name(name, 'mask_lm_trans_fc'),
                             weight_init,
                             act=cfg['hidden_act'])
    self.mlm_ln = _build_ln(hidden_size,
                            name=append_name(name, 'mask_lm_trans'))

    # Zero-initialised output bias, one entry per vocabulary item.
    bias_attr = F.ParamAttr(
        name=append_name(name, 'mask_lm_out_fc.b_0'),
        initializer=F.initializer.Constant(value=0.0))
    self.mlm_bias = L.create_parameter(dtype='float32',
                                       shape=[vocab_size],
                                       attr=bias_attr,
                                       is_bias=True)
def __init__(self, n_loops, n_layers, residual_channels, condition_dim,
             filter_size):
    """ParallelWaveNet, an inverse autoregressive flow model made of
    several flows (WaveNets).

    Args:
        n_loops (List[int]): `n_loop` for each flow.
        n_layers (List[int]): `n_layer` for each flow.
        residual_channels (int): `residual_channels` for every flow.
        condition_dim (int): `condition_dim` for every flow.
        filter_size (int): `filter_size` for every flow.
    """
    super(ParallelWaveNet, self).__init__()

    self.flows = dg.LayerList()
    for flow_loops, flow_layers in zip(n_loops, n_layers):
        # The teacher's log_scale_min does not matter here; -100 is a
        # dummy value.
        flow = WaveNet(flow_loops, flow_layers, residual_channels, 3,
                       condition_dim, filter_size, "mog", -100.0)
        self.flows.append(flow)
def __init__(self,
             block_expansion,
             in_features,
             num_blocks=3,
             max_features=256):
    """Build ``num_blocks`` UpBlock2d layers, deepest level first.

    For level ``i`` (counting down from ``num_blocks - 1`` to 0) the block
    outputs ``min(max_features, block_expansion * 2**i)`` channels. Its
    input is ``min(max_features, block_expansion * 2**(i + 1))`` channels,
    doubled for every level except the deepest one.
    """
    super(Decoder, self).__init__()

    def width(level):
        # Channel count at a given level, capped at max_features.
        return min(max_features, block_expansion * (2**level))

    deepest = num_blocks - 1
    blocks = []
    for level in reversed(range(num_blocks)):
        multiplier = 1 if level == deepest else 2
        blocks.append(
            UpBlock2d(multiplier * width(level + 1),
                      width(level),
                      kernel_size=3,
                      padding=1))
    self.up_blocks = dygraph.LayerList(blocks)

    self.out_filters = block_expansion + in_features
def __init__(self,
             num_class,
             vocab_size,
             emb_dim=32,
             num_filters=10,
             fc_hid_dim=32,
             num_channels=1,
             win_size_list=None,
             is_sparse=True,
             use_cudnn=True):
    """Build embedding -> parallel ConvPool branches -> two Linear layers.

    Args:
        num_class: output size of the final Linear.
        vocab_size: number of rows of the embedding table.
        emb_dim: embedding width; also the second filter dimension.
        num_filters: filters per ConvPool branch.
        fc_hid_dim: width of the hidden tanh Linear.
        num_channels: input channels of every ConvPool.
        win_size_list: window sizes, one ConvPool per entry; defaults to
            ``[3]`` when None.
        is_sparse: forwarded to the embedding.
        use_cudnn: forwarded to every ConvPool.
    """
    super(TextCNN, self).__init__()

    self.embedding = D.Embedding(size=[vocab_size, emb_dim],
                                 dtype='float32',
                                 is_sparse=is_sparse)

    # Log the configuration as received (win_size_list may still be None).
    logging.info("num_class = {}".format(num_class))
    logging.info("vocab size = {}".format(vocab_size))
    logging.info("emb_dim = {}".format(emb_dim))
    logging.info("num filters = {}".format(num_filters))
    logging.info("fc_hid_dim = {}".format(fc_hid_dim))
    logging.info("num channels = {}".format(num_channels))
    logging.info("windows size = {}".format(win_size_list))
    logging.info("is sparse = {}".format(is_sparse))
    logging.info("use cudnn = {}".format(use_cudnn))

    if win_size_list is None:
        win_size_list = [3]

    def gen_conv_pool(win_size):
        """Create the conv-pool layer for the given window size."""
        return ConvPool(
            num_channels,
            num_filters,
            [win_size, emb_dim],
            padding=[1, 0],
            use_cudnn=use_cudnn,
        )

    branches = []
    for win_size in win_size_list:
        branches.append(gen_conv_pool(win_size))
    self.conv_pool_list = D.LayerList(branches)

    # The hidden layer takes num_filters features from every branch.
    self._hid_fc = D.Linear(input_dim=num_filters * len(win_size_list),
                            output_dim=fc_hid_dim,
                            act="tanh")
    self._output_fc = D.Linear(input_dim=fc_hid_dim,
                               output_dim=num_class,
                               act=None)
def __init__(self, upscale_factors=[16, 16]):
    """UpsamplingNet. It consists of several layers of Conv2DTranspose.
    Each Conv2DTranspose layer upsamples the time dimension by its
    `stride` times, and each Conv2DTranspose's filter_size at the
    frequency dimension is 3.

    Args:
        upscale_factors (list[int], optional): time upsampling factors
            for each Conv2DTranspose layer. The `UpsampleNet` contains
            len(upscale_factors) Conv2DTranspose layers. Each
            upscale_factor is used as the `stride` for the corresponding
            Conv2DTranspose. Defaults to [16, 16]. The argument is copied
            into a new list and never mutated.

    Note:
        np.prod(upscale_factors) should equal the `hop_length` of the
        stft transformation used to extract spectrogram features from
        audio. For example, with 16 * 16 = 256 the spectrogram is
        expected to be extracted with an stft whose `hop_length` is 256.
        See `librosa.stft` for more details.
    """
    super(UpsampleNet, self).__init__()

    self.upscale_factors = list(upscale_factors)
    self.upsample_convs = dg.LayerList()
    # Iterate the stored copy rather than the raw argument: a one-shot
    # iterable (e.g. a generator) is already exhausted by list() above and
    # would otherwise produce no layers at all.
    for factor in self.upscale_factors:
        self.upsample_convs.append(
            Conv2DTranspose(1,
                            1,
                            filter_size=(3, 2 * factor),
                            stride=(1, factor),
                            padding=(1, factor // 2)))
def __init__(self, n_loop, n_layer, residual_channels, condition_dim,
             filter_size):
    """The residual network in wavenet. It consists of `n_layer` stacks,
    each of which consists of `n_loop` ResidualBlocks.

    Args:
        n_loop (int): number of ResidualBlocks in a stack.
        n_layer (int): number of stacks in the `ResidualNet`.
        residual_channels (int): channels of each `ResidualBlock`'s input.
        condition_dim (int): channels of the condition.
        filter_size (int): filter size of the internal Conv1DCell of each
            `ResidualBlock`.
    """
    super(ResidualNet, self).__init__()

    # Within one stack the dilation doubles at every block (1, 2, 4, ...);
    # the same pattern is repeated for each of the n_layer stacks.
    one_stack = [2**exponent for exponent in range(n_loop)]
    dilations = one_stack * n_layer
    self.context_size = 1 + sum(dilations)

    blocks = []
    for dilation in dilations:
        blocks.append(
            ResidualBlock(residual_channels, condition_dim, filter_size,
                          dilation))
    self.residual_blocks = dg.LayerList(blocks)
def __init__(self,
             emb_dim,
             num_filters=10,
             num_channels=1,
             win_size_list=None,
             use_cudnn=True):
    """Create one ConvPoolLayer per window size.

    Args:
        emb_dim: second filter dimension of every ConvPoolLayer.
        num_filters: filters per ConvPoolLayer.
        num_channels: input channels of every ConvPoolLayer.
        win_size_list: window sizes, one layer per entry; defaults to
            ``[3]`` when None.
        use_cudnn: forwarded to every ConvPoolLayer.
    """
    super(TextCNNLayer, self).__init__()

    window_sizes = [3] if win_size_list is None else win_size_list

    def gen_conv_pool(win_size):
        """Create the conv-pool layer for the given window size."""
        return ConvPoolLayer(
            num_channels,
            num_filters,
            [win_size, emb_dim],
            padding=[1, 0],
            use_cudnn=use_cudnn,
        )

    branches = []
    for win_size in window_sizes:
        branches.append(gen_conv_pool(win_size))
    self.conv_pool_list = D.LayerList(branches)
def __init__(self,
             num_channels=3,
             block_expansion=64,
             num_blocks=4,
             max_features=512,
             sn=False,
             use_kp=False,
             num_kp=10,
             kp_variance=0.01,
             **kwargs):
    """Build the DownBlock2d stack and the final 1x1 convolution.

    Args:
        num_channels: image channels fed to the first block.
        block_expansion: base width; block ``i`` outputs
            ``min(max_features, block_expansion * 2**(i + 1))`` channels.
        num_blocks: number of DownBlock2d layers.
        max_features: upper bound on any block width.
        sn: forwarded to every DownBlock2d; when true a SpectralNorm
            shaped like the final conv weight is also created.
        use_kp: when true, ``num_kp`` extra input channels are added to
            the first block.
        num_kp: number of extra channels used with ``use_kp``.
        kp_variance: stored as ``self.kp_variance``.
        **kwargs: accepted and ignored.
    """
    super(Discriminator, self).__init__()

    def width(level):
        # Channel count at a given level, capped at max_features.
        return min(max_features, block_expansion * (2**level))

    blocks = []
    for idx in range(num_blocks):
        if idx == 0:
            c_in = num_channels + num_kp * use_kp
        else:
            c_in = width(idx)
        # No norm on the first block, no pooling on the last one.
        blocks.append(
            DownBlock2d(c_in,
                        width(idx + 1),
                        norm=(idx != 0),
                        kernel_size=4,
                        pool=(idx != num_blocks - 1),
                        sn=sn))
    self.down_blocks = dygraph.LayerList(blocks)

    # The final conv's input channel count is read from dimension 0 of the
    # first parameter of the last block's conv. The last element is
    # addressed with an explicit non-negative index.
    last_idx = len(self.down_blocks) - 1
    last_conv_weight = self.down_blocks[last_idx].conv.parameters()[0]
    self.conv = dygraph.Conv2D(last_conv_weight.shape[0], 1, filter_size=1)

    if sn:
        self.sn = dygraph.SpectralNorm(self.conv.parameters()[0].shape,
                                       dim=0)
    else:
        self.sn = None

    self.use_kp = use_kp
    self.kp_variance = kp_variance
def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
    """Build a chain of ``num_layers`` Linear layers.

    The chain maps ``input_dim`` -> ``hidden_dim`` (repeated) ->
    ``output_dim``; with ``num_layers == 1`` it is a single
    Linear(input_dim, output_dim).
    """
    super().__init__()
    self.num_layers = num_layers

    hidden_dims = [hidden_dim] * (num_layers - 1)
    in_dims = [input_dim] + hidden_dims
    out_dims = hidden_dims + [output_dim]
    self.layers = dg.LayerList(
        dg.Linear(d_in, d_out) for d_in, d_out in zip(in_dims, out_dims))
def __init__(self,
             n_speakers,
             speaker_dim,
             in_channels,
             linear_dim,
             convolutions=(ConvSpec(256, 5, 1), ) * 4,
             time_upsampling=1,
             dropout=0.0):
    """Vocoder that transforms mel spectrogram (or encoder hidden states)
    to waveform.

    Args:
        n_speakers (int): number of speakers.
        speaker_dim (int): speaker embedding size.
        in_channels (int): channels of the input.
        linear_dim (int): channels of the linear spectrogram.
        convolutions (Iterable[ConvSpec], optional): specifications of the
            internal convolutional layers. ConvSpec is a namedtuple of
            (output_channels, filter_size, dilation).
            Defaults to (ConvSpec(256, 5, 1), )*4.
        time_upsampling (int, optional): time upsampling factor of the
            converter, possible options are {1, 2, 4}. Note that this
            should equal the downsample factor of the mel spectrogram.
            Defaults to 1.
        dropout (float, optional): dropout probability. Defaults to 0.0.

    Raises:
        ValueError: if ``time_upsampling`` is not 1, 2 or 4.
    """
    super(Converter, self).__init__()

    self.n_speakers = n_speakers
    self.speaker_dim = speaker_dim
    self.in_channels = in_channels
    self.linear_dim = linear_dim
    # CAUTION: this should equal the downsampling steps coefficient.
    self.time_upsampling = time_upsampling
    self.dropout = dropout

    target_channels = convolutions[0].out_channels

    # 1x1 conv projecting the input to the target channel count.
    self.first_conv_proj = Conv1D(
        in_channels,
        target_channels,
        1,
        param_attr=I.Normal(scale=np.sqrt(1 / in_channels)))

    # Idea from nyanko: pick the upsampling block factory by factor.
    if time_upsampling == 4:
        build_upsampling = upsampling_4x_blocks
    elif time_upsampling == 2:
        build_upsampling = upsampling_2x_blocks
    elif time_upsampling == 1:
        build_upsampling = upsampling_1x_blocks
    else:
        raise ValueError(
            "Upsampling factors other than {1, 2, 4} are Not supported.")
    self.upsampling_convolutions = dg.LayerList(
        build_upsampling(n_speakers, speaker_dim, target_channels,
                         dropout))

    # Post conv layers. `gain` is the numerator of the init variance and
    # changes depending on what the preceding layer was.
    gain = 4.0
    channels = target_channels
    self.convolutions = dg.LayerList()
    for spec_channels, filter_size, dilation in convolutions:
        if channels != spec_channels:
            # Channel count changes: insert a 1x1 conv with relu first.
            # CAUTION: relu
            self.convolutions.append(
                Conv1D(channels,
                       spec_channels,
                       1,
                       act="relu",
                       param_attr=I.Normal(scale=np.sqrt(gain / channels))))
            channels = spec_channels
            gain = 2.0
        self.convolutions.append(
            Conv1DGLU(n_speakers,
                      speaker_dim,
                      channels,
                      spec_channels,
                      filter_size,
                      dilation=dilation,
                      std_mul=gain,
                      dropout=dropout))
        channels = spec_channels
        gain = 4.0

    # Final 1x1 conv projecting to the linear spectrogram dimension.
    # CAUTION: sigmoid
    final_std = np.sqrt(gain * (1 - dropout) / channels)
    self.last_conv_proj = Conv1D(channels,
                                 linear_dim,
                                 1,
                                 act="sigmoid",
                                 param_attr=I.Normal(scale=final_std))
def __init__(self,
             n_speakers,
             speaker_dim,
             embed_dim,
             mel_dim,
             r=1,
             max_positions=512,
             preattention=(ConvSpec(128, 5, 1), ) * 4,
             convolutions=(ConvSpec(128, 5, 1), ) * 4,
             attention=True,
             dropout=0.0,
             use_memory_mask=False,
             force_monotonic_attention=False,
             query_position_rate=1.0,
             key_position_rate=1.0,
             window_range=WindowRange(-1, 3),
             key_projection=True,
             value_projection=True):
    """Decoder of the Deep Voice 3 model.

    Args:
        n_speakers (int): number of speakers.
        speaker_dim (int): speaker embedding size.
        embed_dim (int): text embedding size.
        mel_dim (int): channel of mel input (mel bands).
        r (int, optional): number of frames generated per decoder step.
            Defaults to 1.
        max_positions (int, optional): max position for text and decoder
            steps. Defaults to 512.
        preattention (Iterable[ConvSpec], optional): specification of the
            prenet convolutional layers built before the attention stack.
            Defaults to (ConvSpec(128, 5, 1), )*4.
        convolutions (Iterable[ConvSpec], optional): specification of
            causal convolutional layers inside the decoder. ConvSpec is a
            namedtuple of output_channels, filter_size and dilation.
            Defaults to (ConvSpec(128, 5, 1), )*4.
        attention (bool or List[bool], optional): whether to use
            attention. If it is a list of bool it should have the same
            length as `convolutions`, indicating whether to have an
            Attention layer coupled with the corresponding convolutional
            layer. If it is a bool, it is repeated len(convolutions) times
            internally. Defaults to True.
        dropout (float, optional): dropout probability. Defaults to 0.0.
        use_memory_mask (bool, optional): whether to use memory mask at
            the Attention layer. Stored unchanged as
            ``self.use_memory_mask``. Defaults to False.
        force_monotonic_attention (bool, optional): whether to use
            monotonic attention at the Attention layer when inferencing.
            If it is a list of bool it should have the same length as
            `attention`. If it is a bool, it is repeated len(convolutions)
            times internally. Defaults to False.
        query_position_rate (float, optional): position_rate of the
            PositionEmbedding for query. Defaults to 1.0.
        key_position_rate (float, optional): position_rate of the
            PositionEmbedding for key. Defaults to 1.0.
        window_range (WindowRange, optional): window range of monotonic
            attention. Defaults to WindowRange(-1, 3).
        key_projection (bool, optional): `key_projection` of Attention
            layers. Defaults to True.
        value_projection (bool, optional): `value_projection` of Attention
            layers. Defaults to True.

    Raises:
        ValueError: if monotonic attention is forced for a layer that has
            no attention.
    """
    super(Decoder, self).__init__()

    self.dropout = dropout
    self.mel_dim = mel_dim
    self.r = r
    self.query_position_rate = query_position_rate
    self.key_position_rate = key_position_rate
    self.window_range = window_range
    self.n_speakers = n_speakers

    conv_channels = convolutions[0].out_channels
    # Only when the padding idx is 0 can we easily handle it.
    self.embed_keys_positions = PositionEmbedding(max_positions, embed_dim)
    self.embed_query_positions = PositionEmbedding(max_positions,
                                                   conv_channels)

    if n_speakers > 1:
        speaker_std = np.sqrt((1 - dropout) / speaker_dim)
        self.speaker_proj1 = Linear(speaker_dim,
                                    1,
                                    act="sigmoid",
                                    param_attr=I.Normal(scale=speaker_std))
        self.speaker_proj2 = Linear(speaker_dim,
                                    1,
                                    act="sigmoid",
                                    param_attr=I.Normal(scale=speaker_std))

    # Prenet. `gain` is the numerator of the init variance and changes
    # depending on what the preceding layer was.
    self.prenet = dg.LayerList()
    channels = mel_dim * r  # multiframe
    gain = 1.0
    for spec_channels, filter_size, dilation in preattention:
        if channels != spec_channels:
            # Channel count changes: 1x1 conv1d & relu first.
            self.prenet.append(
                Conv1D(channels,
                       spec_channels,
                       1,
                       act="relu",
                       param_attr=I.Normal(scale=np.sqrt(gain / channels))))
            channels = spec_channels
            gain = 2.0
        self.prenet.append(
            Conv1DGLU(n_speakers,
                      speaker_dim,
                      channels,
                      spec_channels,
                      filter_size,
                      dilation,
                      gain,
                      dropout,
                      causal=True,
                      residual=True))
        channels = spec_channels
        gain = 4.0

    # Attention flags: a single bool is broadcast to one flag per conv.
    self.use_memory_mask = use_memory_mask
    n_convs = len(convolutions)
    self.attention = ([attention] * n_convs
                      if isinstance(attention, bool) else attention)
    self.force_monotonic_attention = (
        [force_monotonic_attention] * n_convs if isinstance(
            force_monotonic_attention, bool) else force_monotonic_attention)

    flag_pairs = zip(self.force_monotonic_attention, self.attention)
    if any(forced is True and used is False
           for forced, used in flag_pairs):
        raise ValueError("When not using attention, there is no "
                         "monotonic attention at all")

    # Causal convolution & attention. Each pair is created and then
    # registered as conv_<i> / attn_<i>; layers without attention store
    # None in the pair and register nothing for attn_<i>.
    self.conv_attn = []
    layer_specs = zip(self.attention, convolutions)
    for idx, (use_attention, spec) in enumerate(layer_specs):
        spec_channels, filter_size, dilation = spec
        assert (
            channels == spec_channels
        ), "the stack of convolution & attention does not change channels"
        conv_layer = Conv1DGLU(n_speakers,
                               speaker_dim,
                               channels,
                               spec_channels,
                               filter_size,
                               dilation,
                               gain,
                               dropout,
                               causal=True,
                               residual=False)
        if use_attention:
            attn_layer = Attention(spec_channels,
                                   embed_dim,
                                   dropout,
                                   window_range,
                                   key_projection=key_projection,
                                   value_projection=value_projection)
        else:
            attn_layer = None
        channels = spec_channels
        gain = 4.0

        self.conv_attn.append((conv_layer, attn_layer))
        self.add_sublayer("conv_{}".format(idx), conv_layer)
        if attn_layer is not None:
            self.add_sublayer("attn_{}".format(idx), attn_layer)

    # 1 * 1 conv to transform channels.
    last_std = np.sqrt(gain * (1 - dropout) / channels)
    self.last_conv = Conv1D(channels,
                            mel_dim * r,
                            1,
                            param_attr=I.Normal(scale=last_std))

    # mel (before sigmoid) to done hat.
    fc_std = np.sqrt(1 / channels)
    self.fc = Conv1D(mel_dim * r, 1, 1, param_attr=I.Normal(scale=fc_std))

    # Decoding configs.
    self.max_decoder_steps = 200
    self.min_decoder_steps = 10

    final_channels = convolutions[-1].out_channels
    assert final_channels % r == 0, \
        "decoder_state dim must be divided by r"
    self.state_dim = final_channels // self.r
def __init__(self,
             n_vocab,
             embed_dim,
             n_speakers,
             speaker_dim,
             padding_idx=None,
             embedding_weight_std=0.1,
             convolutions=(ConvSpec(64, 5, 1), ) * 7,
             dropout=0.):
    """Encoder of Deep Voice 3.

    Args:
        n_vocab (int): vocabulary size of the text embedding.
        embed_dim (int): embedding size of the text embedding.
        n_speakers (int): number of speakers.
        speaker_dim (int): speaker embedding size.
        padding_idx (int, optional): padding index of text embedding.
            Defaults to None.
        embedding_weight_std (float, optional): standard deviation of the
            embedding weights when initialized. Defaults to 0.1.
        convolutions (Iterable[ConvSpec], optional): specifications of the
            convolutional layers. ConvSpec is a namedtuple of output
            channels, filter_size and dilation.
            Defaults to (ConvSpec(64, 5, 1), )*7.
        dropout (float, optional): dropout probability. Defaults to 0..
    """
    super(Encoder, self).__init__()

    self.embedding_weight_std = embedding_weight_std
    self.embed = dg.Embedding(
        (n_vocab, embed_dim),
        padding_idx=padding_idx,
        param_attr=I.Normal(scale=embedding_weight_std))

    self.dropout = dropout
    if n_speakers > 1:
        speaker_std = np.sqrt((1 - dropout) / speaker_dim)
        self.sp_proj1 = Linear(speaker_dim,
                               embed_dim,
                               act="softsign",
                               param_attr=I.Normal(scale=speaker_std))
        self.sp_proj2 = Linear(speaker_dim,
                               embed_dim,
                               act="softsign",
                               param_attr=I.Normal(scale=speaker_std))
    self.n_speakers = n_speakers

    # `gain` is the numerator of the init variance and changes depending
    # on what the preceding layer was.
    self.convolutions = dg.LayerList()
    channels = embed_dim
    gain = 1.0
    for spec_channels, filter_size, dilation in convolutions:
        if channels != spec_channels:
            # Channel count changes: 1 * 1 convolution & relu first.
            self.convolutions.append(
                Conv1D(channels,
                       spec_channels,
                       1,
                       act="relu",
                       param_attr=I.Normal(scale=np.sqrt(gain / channels))))
            channels = spec_channels
            gain = 2.0

        self.convolutions.append(
            Conv1DGLU(n_speakers,
                      speaker_dim,
                      channels,
                      spec_channels,
                      filter_size,
                      dilation,
                      gain,
                      dropout,
                      causal=False,
                      residual=True))
        channels = spec_channels
        gain = 4.0

    # Final 1x1 conv projecting back to the embedding size.
    final_std = np.sqrt(gain * (1 - dropout) / channels)
    self.convolutions.append(
        Conv1D(channels, embed_dim, 1, param_attr=I.Normal(scale=final_std)))