def __init__(self, dim=300, K=65536, m=0.999, T=0.07, mlp=False):
    """MoCo wrapper around a query/key pair of ERNIE encoders.

    Args:
        dim: feature dimension (default: 300)
        K: queue size; number of negative keys (default: 65536)
        m: moco momentum of updating key encoder (default: 0.999)
        T: softmax temperature (default: 0.07)
        mlp: if True, prepend a hidden ReLU projection to each classifier head.
    """
    super(MoCo, self).__init__()
    self.K = K
    self.m = m
    self.T = T
    # create the encoders
    self.encoder_q = ErnieModelForSequenceClassification.from_pretrained('ernie-2.0-large-en', num_labels=dim)
    self.encoder_k = ErnieModelForSequenceClassification.from_pretrained('ernie-2.0-large-en', num_labels=dim)
    if mlp:
        dim_mlp = 1024
        self.encoder_q.classifier = D.Sequential(D.Linear(dim_mlp, dim_mlp, act='relu'), self.encoder_q.classifier)
        self.encoder_k.classifier = D.Sequential(D.Linear(dim_mlp, dim_mlp, act='relu'), self.encoder_k.classifier)
    for param_q, param_k in zip(self.encoder_q.parameters(), self.encoder_k.parameters()):
        # BUG FIX: the original `param_k = param_q` merely rebound the loop
        # variable (no data copy), and the subsequent
        # `param_k.requires_grad = False` therefore disabled gradients on the
        # *query* encoder's parameter instead — and `requires_grad` is not
        # paddle's flag anyway. Copy values in place and freeze the key
        # encoder via `stop_gradient`.
        param_k.set_value(param_q.numpy())  # initialize key encoder from query encoder
        param_k.stop_gradient = True  # key encoder is updated by momentum, not by gradient
    # create the queue of negative keys, L2-normalized along the feature axis
    self.queue = L.randn([dim, K])
    self.queue = norm(self.queue, dim=0)
    self.queue_ptr = L.zeros([1], dtype='int32')
def __init__(self,
             num_class,
             vocab_size,
             emb_dim=128,
             gru_dim=256,
             fc_hid_dim=256,
             is_sparse=True,
             bi_direction=True,
             ):
    """(Optionally bidirectional) GRU text classifier.

    Args:
        num_class (int): number of output classes.
        vocab_size (int): embedding vocabulary size.
        emb_dim (int): token embedding dimension.
        gru_dim (int): GRU hidden size.
        fc_hid_dim (int): hidden fully-connected layer size.
        is_sparse (bool): use sparse gradient updates for the embedding.
        bi_direction (bool): also run a reversed GRU and concatenate both directions.
    """
    super(GRU, self).__init__()
    self.bi_direction = bi_direction
    self.embedding = D.Embedding(
        size=[vocab_size, emb_dim],
        dtype='float32',
        #param_attr=F.ParamAttr(learning_rate=30),
        is_sparse=is_sparse)
    # DynamicGRU consumes a projected input of size 3 * gru_dim (update/reset gates + candidate)
    self._hid_fc1 = D.Linear(input_dim=emb_dim, output_dim=gru_dim * 3)
    self._gru_forward = DynamicGRU(size=gru_dim, h_0=None, is_reverse=False)
    if bi_direction:
        self._gru_backward = DynamicGRU(size=gru_dim, h_0=None, is_reverse=True)
        # forward + backward states are concatenated, hence 2 * gru_dim
        self._hid_fc2 = D.Linear(input_dim=gru_dim * 2, output_dim=fc_hid_dim, act="tanh")
    else:
        self._hid_fc2 = D.Linear(input_dim=gru_dim, output_dim=fc_hid_dim, act="tanh")
    self._output_fc = D.Linear(input_dim=fc_hid_dim, output_dim=num_class, act=None)
def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2):
    """Prenet before passing through the network.

    Args:
        input_size (int): the input channel size.
        hidden_size (int): the size of hidden layer in network.
        output_size (int): the output channel size.
        dropout_rate (float, optional): dropout probability. Defaults to 0.2.
    """
    super(PreNet, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.dropout_rate = dropout_rate

    def _fc(in_dim, out_dim):
        # Xavier weight; bias uniform in [-1/sqrt(in_dim), +1/sqrt(in_dim)]
        bound = math.sqrt(1.0 / in_dim)
        return dg.Linear(
            in_dim,
            out_dim,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-bound, high=bound)))

    # creation order (linear1 then linear2) preserved so parameter naming is unchanged
    self.linear1 = _fc(input_size, hidden_size)
    self.linear2 = _fc(hidden_size, output_size)
def __init__(self, in_channels, reduction_factor, prenet_sizes, layers, kernel_size, attention_dim, position_encoding_weight=1., omega=1., has_bias=False, bias_dim=0, keep_prob=1.): super(Decoder, self).__init__() # prenet-mind the difference of AffineBlock2 and AffineBlock1 c_in = in_channels self.prenet = dg.LayerList() for i, c_out in enumerate(prenet_sizes): affine = AffineBlock2(c_in, c_out, has_bias, bias_dim, dropout=(i != 0), keep_prob=keep_prob) self.prenet.append(affine) c_in = c_out # causal convolutions + multihop attention decoder_dim = prenet_sizes[-1] self.causal_convs = dg.LayerList() self.attention_blocks = dg.LayerList() for i in range(layers): conv = ConvBlock(decoder_dim, kernel_size, True, has_bias, bias_dim, keep_prob) attn = AttentionBlock(attention_dim, decoder_dim, position_encoding_weight, omega, reduction_factor, has_bias, bias_dim, keep_prob) self.causal_convs.append(conv) self.attention_blocks.append(attn) # output mel spectrogram output_dim = reduction_factor * in_channels # r * mel_dim std = np.sqrt(1.0 / decoder_dim) initializer = I.NormalInitializer(loc=0., scale=std) out_affine = dg.Linear(decoder_dim, output_dim, param_attr=initializer) self.out_affine = weight_norm(out_affine, dim=-1) if has_bias: self.out_sp_affine = dg.Linear(bias_dim, output_dim) self.has_bias = has_bias self.kernel_size = kernel_size self.in_channels = in_channels self.decoder_dim = decoder_dim self.reduction_factor = reduction_factor self.out_channels = output_dim
def __init__(self, num_features, num_classes, epsilon=1e-5, momentum=0.1):
    """Class-conditional batch norm: gamma/beta come from linear embeddings of the class vector."""
    super().__init__()
    # feature-wise statistics only; the affine parameters are produced by the embeddings below
    self.bn_in_cond = BatchNorm(num_features,
                                affine=False,
                                epsilon=epsilon,
                                momentum=momentum)

    def _class_embed():
        return SpectralNorm(
            dg.Linear(num_classes, num_features, bias_attr=False))

    # creation order (gamma then beta) preserved so parameter naming is unchanged
    self.gamma_embed = _class_embed()
    self.beta_embed = _class_embed()
def __init__(self, in_channel, out_channel, has_bias=False, bias_dim=0):
    """Weight-normalized affine projection with an optional speaker-bias path."""
    super(AffineBlock1, self).__init__()
    # weight std scaled by fan-in
    weight_init = I.NormalInitializer(loc=0., scale=np.sqrt(1.0 / in_channel))
    self.affine = weight_norm(
        dg.Linear(in_channel, out_channel, param_attr=weight_init), dim=-1)
    if has_bias:
        self.bias_affine = dg.Linear(bias_dim, out_channel)
    self.has_bias = has_bias
    self.bias_dim = bias_dim
def __init__(self, name=None, num=None):
    """Video classifier: conv-bn stem, three conv-pool stages, then an FC head.

    Args:
        name: unused here; kept for caller compatibility.
        num: unused here; kept for caller compatibility.
    """
    super(TSNResNet, self).__init__()
    self.convbn = convbn(3, 16)
    self.convpools = dygraph.Sequential(convpool(16, 32, pooling=4),
                                        convpool(32, 64, pooling=4),
                                        convpool(64, 128))
    # 7 * 7 * 128 assumes the conv stack reduces the input to a 7x7 map — TODO confirm input size
    self.fcs = dygraph.Sequential(
        dygraph.Linear(7 * 7 * 128, 1024, act='relu'),
        dygraph.BatchNorm(1024),
        dygraph.Dropout(0.5),
        dygraph.Linear(1024, 101, act='softmax'))  # 101 outputs (presumably UCF-101 — verify)
    self.seg_num = 32  # number of temporal segments
def __init__(self, mlp_head_dim, num_classes, num_bbox_reg_classes,
             roi_size, roi_spatial_scale, roi_sampling_ratio):
    """Two-FC box head over FPN RoI features with classification and box-regression outputs.

    Args:
        mlp_head_dim (int): hidden size of the fc6/fc7 head.
        num_classes (int): number of classification outputs.
        num_bbox_reg_classes (int): classes with separate box regressors (4 deltas each).
        roi_size (int): RoI pooling output resolution.
        roi_spatial_scale: spatial scale used by RoI pooling.
        roi_sampling_ratio: sampling ratio used by RoI pooling.
    """
    super().__init__()
    in_dim = 256  # FPN output dimension
    self.mlp_head_dim = mlp_head_dim
    self.roi_size = roi_size
    self.roi_spatial_scale = roi_spatial_scale
    self.roi_sampling_ratio = roi_sampling_ratio
    # flattened RoI feature -> hidden -> hidden
    self.fc6 = dg.Linear(in_dim * roi_size * roi_size, mlp_head_dim)
    self.fc7 = dg.Linear(mlp_head_dim, mlp_head_dim)
    self.cls_score = dg.Linear(mlp_head_dim, num_classes)
    self.bbox_pred = dg.Linear(mlp_head_dim, num_bbox_reg_classes * 4)
def __init__(self, in_channel, out_channel, has_bias=False, bias_dim=0,
             dropout=False, keep_prob=1.):
    """Affine block with an optional input-side speaker bias and a dropout flag."""
    super(AffineBlock2, self).__init__()
    if has_bias:
        # speaker bias projected into the input space, fan-in scaled init
        self.bias_affine = dg.Linear(
            bias_dim, in_channel,
            param_attr=I.Normal(scale=np.sqrt(1 / bias_dim)))
    # weight-normalized main projection (creation order preserved: bias path first)
    self.affine = weight_norm(
        dg.Linear(in_channel, out_channel,
                  param_attr=I.Normal(scale=np.sqrt(1.0 / in_channel))),
        dim=-1)
    self.has_bias = has_bias
    self.bias_dim = bias_dim
    self.dropout = dropout
    self.keep_prob = keep_prob
def __init__(self):
    """Small image classifier: one 3->1 channel conv, then a 784->128->5 FC head."""
    super(MNIST, self).__init__()
    # collapses 3 input channels to 1 feature map; padding=1 keeps spatial size
    self.cnn = dy.Conv2D(num_channels=3,
                         num_filters=1,
                         filter_size=3,
                         stride=1,
                         padding=1,
                         act='relu')
    # input_dim=784 assumes a 28x28 map after the conv — TODO confirm input size.
    # NOTE(review): despite the class name, the head outputs 5 classes, not 10 — confirm dataset.
    self.cls = dy.Sequential(
        dy.Linear(input_dim=784, output_dim=128),
        dy.Dropout(p=.2),
        dy.Linear(input_dim=128, output_dim=5),
    )
def __init__(self, backbone, transformer, num_classes, num_queries, aux_loss=False):
    """ Initializes the model.

    Parameters:
        backbone: See backbone.py
        transformer: See transformer.py
        num_classes: number of object classes
        num_queries: number of object queries, ie the detection slot. This is the
            maximal number of objects DETR can detect in a single image. For COCO,
            we recommend 100 queries.
        aux_loss: True if auxiliary decoding losses (loss at each decoder layer)
            are to be used.
    """
    super().__init__()
    self.num_queries = num_queries
    self.transformer = transformer
    hidden_dim = transformer.d_model
    # +1 output for the "no object" class
    self.class_embed = dg.Linear(hidden_dim, num_classes + 1)
    # 3-layer MLP producing 4 box values per query
    self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
    # learned object-query embeddings
    self.query_embed = dg.Embedding((num_queries, hidden_dim))
    # 1x1 conv projecting backbone channels to the transformer width
    self.input_proj = dg.Conv2D(backbone.num_channels, hidden_dim, filter_size=1)
    self.backbone = backbone
    self.aux_loss = aux_loss
def __init__(self, code_dim=128, n_class=1000, chn=96,
             blocks_with_attention="B4", resolution=512):
    """Conditional generator: noise is split into per-block chunks, each concatenated
    with a 128-d class embedding and fed to a stack of conditional ResBlocks.

    Args:
        code_dim (int): total noise vector size, split across blocks.
        n_class (int): number of classes for the class embedding.
        chn (int): base channel multiplier.
        blocks_with_attention (str): comma-separated block tags (e.g. "B4") that get self-attention.
        resolution (int): output image resolution.
    """
    super().__init__()

    def GBlock(in_channel, out_channel, n_class, z_dim, use_attention):
        return ResBlock(in_channel,
                        out_channel,
                        n_class=n_class,
                        z_dim=z_dim,
                        use_attention=use_attention)

    self.embed_y = dg.Linear(n_class, 128, bias_attr=False)
    self.chn = chn
    self.resolution = resolution
    self.blocks_with_attention = set(blocks_with_attention.split(","))
    self.blocks_with_attention.discard('')
    gblock = []
    in_channels, out_channels = self.get_in_out_channels()
    # one split per block plus one consumed by the initial noise_fc
    self.num_split = len(in_channels) + 1
    # each block conditions on a z chunk concatenated with the 128-d class embedding
    z_dim = code_dim // self.num_split + 128
    # first noise chunk -> initial 4x4 feature map
    self.noise_fc = SpectralNorm(
        dg.Linear(code_dim // self.num_split, 4 * 4 * in_channels[0]))
    self.sa_ids = [int(s.split('B')[-1]) for s in self.blocks_with_attention]
    for i, (nc_in, nc_out) in enumerate(zip(in_channels, out_channels)):
        gblock.append(
            GBlock(nc_in,
                   nc_out,
                   n_class=n_class,
                   z_dim=z_dim,
                   use_attention=(i + 1) in self.sa_ids))
    self.blocks = dg.LayerList(gblock)
    # final BN + conv producing the 3-channel image
    self.output_layer_bn = BatchNorm(1 * chn, epsilon=1e-5)
    self.output_layer_conv = SpectralNorm(
        dg.Conv2D(1 * chn, 3, [3, 3], padding=1))
def _build_linear(n_in, n_out, name, init, act=None):
    """Create a D.Linear whose weight/bias parameter names derive from `name`.

    When `name` is None, the parameters keep framework-generated names.
    """
    if name is not None:
        weight_attr = F.ParamAttr(name='%s.w_0' % name, initializer=init)
        bias_attr = '%s.b_0' % name
    else:
        weight_attr = F.ParamAttr(name=None, initializer=init)
        bias_attr = None
    return D.Linear(n_in, n_out,
                    param_attr=weight_attr,
                    bias_attr=bias_attr,
                    act=act)
def __init__(self, cfg, num_mels=80):
    """FastSpeech model.

    Args:
        cfg: the yaml configs used in FastSpeech model.
        num_mels (int, optional): the number of mel bands when calculating
            mel spectrograms. Defaults to 80.
    """
    super(FastSpeech, self).__init__()
    # phoneme encoder (FFT-block stack); vocab is the symbol set plus padding
    self.encoder = Encoder(
        n_src_vocab=len(symbols) + 1,
        len_max_seq=cfg['max_seq_len'],
        n_layers=cfg['encoder_n_layer'],
        n_head=cfg['encoder_head'],
        d_k=cfg['hidden_size'] // cfg['encoder_head'],
        d_q=cfg['hidden_size'] // cfg['encoder_head'],
        d_model=cfg['hidden_size'],
        d_inner=cfg['encoder_conv1d_filter_size'],
        fft_conv1d_kernel=cfg['fft_conv1d_filter'],
        fft_conv1d_padding=cfg['fft_conv1d_padding'],
        dropout=0.1)
    # duration prediction + expansion of encoder outputs to frame level
    self.length_regulator = LengthRegulator(
        input_size=cfg['hidden_size'],
        out_channels=cfg['duration_predictor_output_size'],
        filter_size=cfg['duration_predictor_filter_size'],
        dropout=cfg['dropout'])
    self.decoder = Decoder(
        len_max_seq=cfg['max_seq_len'],
        n_layers=cfg['decoder_n_layer'],
        n_head=cfg['decoder_head'],
        d_k=cfg['hidden_size'] // cfg['decoder_head'],
        d_q=cfg['hidden_size'] // cfg['decoder_head'],
        d_model=cfg['hidden_size'],
        d_inner=cfg['decoder_conv1d_filter_size'],
        fft_conv1d_kernel=cfg['fft_conv1d_filter'],
        fft_conv1d_padding=cfg['fft_conv1d_padding'],
        dropout=0.1)
    # Xavier weight / uniform bias attrs for the mel projection
    self.weight = fluid.ParamAttr(
        initializer=fluid.initializer.XavierInitializer())
    k = math.sqrt(1.0 / cfg['hidden_size'])
    self.bias = fluid.ParamAttr(initializer=fluid.initializer.Uniform(
        low=-k, high=k))
    # projects decoder states to `outputs_per_step` mel frames per step
    self.mel_linear = dg.Linear(
        cfg['hidden_size'],
        num_mels * cfg['outputs_per_step'],
        param_attr=self.weight,
        bias_attr=self.bias, )
    # residual post-processing network over the predicted mel spectrogram
    self.postnet = PostConvNet(
        n_mels=num_mels,
        num_hidden=512,
        filter_size=5,
        padding=int(5 / 2),
        num_conv=5,
        outputs_per_step=cfg['outputs_per_step'],
        use_cudnn=True,
        dropout=0.1,
        batchnorm_last=True)
def __init__(self,
             num_class,
             vocab_size,
             emb_dim=32,
             num_filters=10,
             fc_hid_dim=32,
             num_channels=1,
             win_size_list=None,
             is_sparse=True,
             use_cudnn=True,
             ):
    """TextCNN classifier: embedding -> parallel conv-pools (one per window size) -> FC head.

    Args:
        num_class (int): number of output classes.
        vocab_size (int): embedding vocabulary size.
        emb_dim (int): token embedding dimension.
        num_filters (int): filters per convolution window.
        fc_hid_dim (int): hidden fully-connected layer size.
        num_channels (int): input channels for the convolutions.
        win_size_list (list[int] | None): convolution window sizes; defaults to [3].
        is_sparse (bool): use sparse gradient updates for the embedding.
        use_cudnn (bool): use cuDNN-backed convolution kernels.
    """
    super(TextCNN, self).__init__()
    self.embedding = D.Embedding(
        size=[vocab_size, emb_dim],
        dtype='float32',
        is_sparse=is_sparse)
    logging.info("num_class = {}".format(num_class))
    logging.info("vocab size = {}".format(vocab_size))
    logging.info("emb_dim = {}".format(emb_dim))
    logging.info("num filters = {}".format(num_filters))
    logging.info("fc_hid_dim = {}".format(fc_hid_dim))
    logging.info("num channels = {}".format(num_channels))
    logging.info("windows size = {}".format(win_size_list))
    logging.info("is sparse = {}".format(is_sparse))
    logging.info("use cudnn = {}".format(use_cudnn))
    win_size_list = [3] if win_size_list is None else win_size_list

    def gen_conv_pool(win_size):
        """Build a conv + pooling layer for the given window size."""
        return ConvPool(
            num_channels,
            num_filters,
            [win_size, emb_dim],
            padding=[1, 0],
            use_cudnn=use_cudnn,
        )

    self.conv_pool_list = D.LayerList([gen_conv_pool(win_size) for win_size in win_size_list])
    # each conv-pool contributes num_filters features to the concatenated representation
    self._hid_fc = D.Linear(input_dim=num_filters * len(win_size_list),
                            output_dim=fc_hid_dim, act="tanh")
    self._output_fc = D.Linear(input_dim=fc_hid_dim, output_dim=num_class, act=None)
def Linear(input_dim, output_dim, param_attr=None, bias_attr=None, act=None, dtype="float32"):
    """dg.Linear wrapped with weight normalization (applied along dim 1)."""
    inner = dg.Linear(input_dim, output_dim, param_attr, bias_attr, act, dtype)
    return WeightNormWrapper(inner, dim=1)
def __init__(self, layers, in_channels, postnet_dim, kernel_size, out_channels,
             upsample_factor, has_bias=False, bias_dim=0, keep_prob=1.):
    """PostNet: input affine -> conv-block stack -> output affine, with a temporal upsample factor.

    Args:
        layers (int): number of conv blocks.
        in_channels (int): input feature size.
        postnet_dim (int): internal feature size.
        kernel_size (int): conv-block kernel size.
        out_channels (int): output feature size.
        upsample_factor (int): temporal upsampling factor (applied in forward).
        has_bias (bool): enable the speaker-bias paths.
        bias_dim (int): speaker-bias embedding size.
        keep_prob (float): dropout keep probability.
    """
    super(PostNet, self).__init__()
    self.pre_affine = AffineBlock1(in_channels, postnet_dim, has_bias, bias_dim)
    # non-causal conv blocks
    self.convs = dg.LayerList([
        ConvBlock(postnet_dim, kernel_size, False, has_bias, bias_dim, keep_prob)
        for _ in range(layers)
    ])
    # fan-in scaled init for the weight-normalized output projection
    std = np.sqrt(1.0 / postnet_dim)
    post_affine = dg.Linear(postnet_dim, out_channels, param_attr=I.Normal(scale=std))
    self.post_affine = weight_norm(post_affine, dim=-1)
    self.upsample_factor = upsample_factor
def __init__(self, n_class=1000, chn=96, blocks_with_attention="B2", resolution=256):
    """Projection discriminator: ResBlock stack with optional self-attention,
    a final linear score, and a class-projection embedding.

    Args:
        n_class (int): number of classes for the projection embedding.
        chn (int): base channel multiplier.
        blocks_with_attention (str): comma-separated block tags (e.g. "B2") that get self-attention.
        resolution (int): input image resolution.
    """
    super().__init__()

    def DBlock(in_channel, out_channel, downsample=True, use_attention=False, skip_proj=None):
        return ResBlock(in_channel,
                        out_channel,
                        conditional=False,
                        upsample=False,
                        downsample=downsample,
                        use_attention=use_attention,
                        skip_proj=skip_proj)

    self.chn = chn
    self.colors = 3
    self.resolution = resolution
    self.blocks_with_attention = set(blocks_with_attention.split(","))
    self.blocks_with_attention.discard('')
    dblock = []
    in_channels, out_channels = self.get_in_out_channels()
    self.sa_ids = [int(s.split('B')[-1]) for s in self.blocks_with_attention]
    # all but the last block downsample; skip_proj records whether in/out channels match
    for i, (nc_in, nc_out) in enumerate(zip(in_channels[:-1], out_channels[:-1])):
        dblock.append(
            DBlock(nc_in,
                   nc_out,
                   downsample=True,
                   use_attention=(i + 1) in self.sa_ids,
                   skip_proj=nc_in == nc_out))
    dblock.append(
        DBlock(in_channels[-1],
               out_channels[-1],
               downsample=False,
               use_attention=len(out_channels) in self.sa_ids,
               skip_proj=in_channels[-1] == out_channels[-1]))
    self.blocks = dg.LayerList(dblock)
    # final scalar score and class projection, both spectrally normalized
    self.final_fc = SpectralNorm(dg.Linear(16 * chn, 1))
    self.embed_y = dg.Embedding(size=[n_class, 16 * chn],
                                is_sparse=False,
                                param_attr=Uniform(-0.1, 0.1))
    self.embed_y = SpectralNorm(self.embed_y)
def __init__(self,
             num_class,
             vocab_size,
             emb_dim=128,
             lstm_dim=256,
             fc_hid_dim=256,
             is_sparse=True,
             bi_direction=True,
             dropout_prob=0.1,
             ):
    """(Optionally bidirectional) dynamic-LSTM text classifier.

    Args:
        num_class (int): number of output classes.
        vocab_size (int): embedding vocabulary size.
        emb_dim (int): token embedding dimension.
        lstm_dim (int): LSTM hidden size.
        fc_hid_dim (int): hidden fully-connected layer size.
        is_sparse (bool): use sparse gradient updates for the embedding.
        bi_direction (bool): also run a reversed LSTM and concatenate both directions.
        dropout_prob (float): dropout probability applied by `self.dropout` during training.
    """
    super(DynamicLSTMClassifier, self).__init__()
    logging.info("num_class = {}".format(num_class))
    logging.info("vocab_size = {}".format(vocab_size))
    logging.info("emb_dim = {}".format(emb_dim))
    logging.info("lstm_dim = {}".format(lstm_dim))
    logging.info("fc_hid_dim = {}".format(fc_hid_dim))
    logging.info("is_sparse = {}".format(is_sparse))
    logging.info("bi_direction = {}".format(bi_direction))
    logging.info("dropout_prob = {}".format(dropout_prob))
    self.bi_direction = bi_direction
    self.embedding = EmbeddingLayer(
        vocab_size=vocab_size,
        emb_dim=emb_dim,
        dtype='float32',
        is_sparse=is_sparse)
    self._lstm_forward = DynamicLSTMLayer(input_size=emb_dim, hidden_size=lstm_dim, is_reverse=False)
    if bi_direction:
        self._lstm_backward = DynamicLSTMLayer(input_size=emb_dim, hidden_size=lstm_dim, is_reverse=True)
        # forward + backward states are concatenated, hence 2 * lstm_dim
        self._hid_fc2 = D.Linear(input_dim=lstm_dim * 2, output_dim=fc_hid_dim, act="tanh")
    else:
        self._hid_fc2 = D.Linear(input_dim=lstm_dim, output_dim=fc_hid_dim, act="tanh")
    self._output_fc = D.Linear(input_dim=fc_hid_dim, output_dim=num_class, act=None)
    # NOTE: `self.training` is evaluated at call time inside the lambda,
    # so dropout is correctly skipped in eval mode.
    self.dropout = lambda i: L.dropout(i, dropout_prob=dropout_prob, dropout_implementation="upscale_in_train") if self.training else i
def __init__(self, n_in, n_out, dropout=0):
    """Single linear projection (Xavier-normal weights, default bias) followed by shared dropout."""
    super(MLP, self).__init__()
    self.n_in = n_in
    self.n_out = n_out
    xavier_normal = initializer.Xavier(uniform=False)
    self.linear = dygraph.Linear(n_in, n_out,
                                 param_attr=xavier_normal,
                                 bias_attr=None)
    self.dropout = SharedDropout(p=dropout)
def __init__(self,
             num_class,
             vocab_size,
             emb_dim=32,
             num_filters=10,
             fc_hid_dim=32,
             num_channels=1,
             win_size_list=None,
             is_sparse=True,
             use_cudnn=True,
             ):
    """TextCNN classifier built from shared EmbeddingLayer/TextCNNLayer components.

    Args:
        num_class (int): number of output classes.
        vocab_size (int): embedding vocabulary size.
        emb_dim (int): token embedding dimension.
        num_filters (int): filters per convolution window.
        fc_hid_dim (int): hidden fully-connected layer size.
        num_channels (int): input channels for the convolutions.
        win_size_list (list[int] | None): convolution window sizes.
        is_sparse (bool): use sparse gradient updates for the embedding.
        use_cudnn (bool): use cuDNN-backed convolution kernels.
    """
    super(TextCNNClassifier, self).__init__()
    self.embedding = EmbeddingLayer(
        vocab_size=vocab_size,
        emb_dim=emb_dim,
        dtype='float32',
        is_sparse=is_sparse,
    )
    self.textcnn = TextCNNLayer(
        emb_dim,
        num_filters,
        num_channels,
        win_size_list,
        use_cudnn,
    )
    logging.info("num_class = {}".format(num_class))
    logging.info("vocab size = {}".format(vocab_size))
    logging.info("emb_dim = {}".format(emb_dim))
    logging.info("num filters = {}".format(num_filters))
    logging.info("fc_hid_dim = {}".format(fc_hid_dim))
    logging.info("num channels = {}".format(num_channels))
    logging.info("win size list = {}".format(win_size_list))
    logging.info("is sparse = {}".format(is_sparse))
    logging.info("use cudnn = {}".format(use_cudnn))
    # each conv window contributes num_filters features
    # NOTE(review): unlike TextCNN above, a None win_size_list is not defaulted
    # here before len() — presumably TextCNNLayer requires it non-None; verify.
    self._hid_fc = D.Linear(input_dim=num_filters * len(win_size_list),
                            output_dim=fc_hid_dim, act="tanh")
    self._output_fc = D.Linear(input_dim=fc_hid_dim, output_dim=num_class, act=None)
def __init__(self,
             num_class,
             vocab_size,
             emb_dim=128,
             gru_dim=256,
             fc_hid_dim=256,
             is_sparse=True,
             bi_direction=True,
             ):
    """(Optionally bidirectional) GRU text classifier built from shared layer components.

    Args:
        num_class (int): number of output classes.
        vocab_size (int): embedding vocabulary size.
        emb_dim (int): token embedding dimension.
        gru_dim (int): GRU hidden size.
        fc_hid_dim (int): hidden fully-connected layer size.
        is_sparse (bool): use sparse gradient updates for the embedding.
        bi_direction (bool): also run a reversed GRU and concatenate both directions.
    """
    super(GRUClassifier, self).__init__()
    logging.info("num_class = {}".format(num_class))
    logging.info("vocab_size = {}".format(vocab_size))
    logging.info("emb_dim = {}".format(emb_dim))
    logging.info("gru_dim = {}".format(gru_dim))
    logging.info("fc_hid_dim = {}".format(fc_hid_dim))
    logging.info("is_sparse = {}".format(is_sparse))
    logging.info("bi_direction = {}".format(bi_direction))
    self.bi_direction = bi_direction
    self.embedding = EmbeddingLayer(
        vocab_size=vocab_size,
        emb_dim=emb_dim,
        dtype='float32',
        #param_attr=F.ParamAttr(learning_rate=30),
        is_sparse=is_sparse)
    # DynamicGRULayer consumes a projected input of size 3 * gru_dim (gates + candidate)
    self._hid_fc1 = D.Linear(input_dim=emb_dim, output_dim=gru_dim * 3)
    self._gru_forward = DynamicGRULayer(size=gru_dim, h_0=None, is_reverse=False)
    if bi_direction:
        self._gru_backward = DynamicGRULayer(size=gru_dim, h_0=None, is_reverse=True)
        # forward + backward states are concatenated, hence 2 * gru_dim
        self._hid_fc2 = D.Linear(input_dim=gru_dim * 2, output_dim=fc_hid_dim, act="tanh")
    else:
        self._hid_fc2 = D.Linear(input_dim=gru_dim, output_dim=fc_hid_dim, act="tanh")
    self._output_fc = D.Linear(input_dim=fc_hid_dim, output_dim=num_class, act=None)
def __init__(self, block, layers, num_classes=1000, zero_init_residual=False,
             groups=1, width_per_group=64, replace_stride_with_dilation=None,
             norm_layer=None):
    """torchvision-style ResNet backbone implemented with paddle dygraph layers.

    Args:
        block: residual block class (BasicBlock or Bottleneck).
        layers (list[int]): number of blocks in each of the four stages.
        num_classes (int): classifier output size.
        zero_init_residual (bool): zero-init the last BN in each residual branch.
        groups (int): grouped-convolution group count.
        width_per_group (int): base width per group.
        replace_stride_with_dilation (list[bool] | None): per-stage dilation instead of stride.
        norm_layer: normalization layer class; defaults to dg.BatchNorm.
    """
    super(ResNet, self).__init__()
    if norm_layer is None:
        norm_layer = dg.BatchNorm
    self._norm_layer = norm_layer
    self.inplanes = 64
    self.dilation = 1
    if replace_stride_with_dilation is None:
        # each element in the tuple indicates if we should replace
        # the 2x2 stride with a dilated convolution instead
        replace_stride_with_dilation = [False, False, False]
    if len(replace_stride_with_dilation) != 3:
        raise ValueError("replace_stride_with_dilation should be None "
                         "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
    self.groups = groups
    self.base_width = width_per_group
    # stem: 7x7/2 conv -> BN -> ReLU -> 3x3/2 max pool
    self.conv1 = dg.Conv2D(3, self.inplanes, filter_size=7, stride=2, padding=3,
                           bias_attr=False)
    self.bn1 = norm_layer(self.inplanes)
    self.relu = ReLU()
    self.maxpool = dg.Pool2D(pool_size=3, pool_type='max', pool_stride=2, pool_padding=1)
    self.layer1 = self._make_layer(block, 64, layers[0])
    self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
                                   dilate=replace_stride_with_dilation[0])
    self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                   dilate=replace_stride_with_dilation[1])
    self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
                                   dilate=replace_stride_with_dilation[2])
    self.avgpool = lambda x: L.adaptive_pool2d(x, (1, 1), pool_type='avg')
    self.fc = dg.Linear(512 * block.expansion, num_classes)
    # NOTE(review): assigning `param_attr`/`bias_attr` on already-constructed
    # layers likely does NOT re-initialize their existing parameters (paddle
    # reads these attrs at construction time) — confirm whether these init
    # loops actually have any effect.
    for m in self.sublayers():
        if isinstance(m, dg.Conv2D):
            m.param_attr = F.ParamAttr(initializer=F.initializer.MSRAInitializer())
        elif isinstance(m, (dg.BatchNorm, dg.GroupNorm)):
            m.param_attr = F.ParamAttr(initializer=F.initializer.ConstantInitializer(value=0.0))
            m.bias_attr = F.ParamAttr(initializer=F.initializer.ConstantInitializer(value=0.0))
    # Zero-initialize the last BN in each residual branch, so that the residual
    # branch starts with zeros, and each residual block behaves like an identity.
    # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
    if zero_init_residual:
        for m in self.sublayers():
            if isinstance(m, Bottleneck):
                m.bn3.param_attr = F.ParamAttr(initializer=F.initializer.ConstantInitializer(value=0.0))
            elif isinstance(m, BasicBlock):
                m.bn2.param_attr = F.ParamAttr(initializer=F.initializer.ConstantInitializer(value=0.0))
def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
    """Duration Predictor block in FastSpeech.

    Args:
        input_size (int): the channel number of input.
        out_channels (int): the output channel number.
        filter_size (int): the filter size.
        dropout (float, optional): dropout probability. Defaults to 0.1.
    """
    super(DurationPredictor, self).__init__()
    self.input_size = input_size
    self.out_channels = out_channels
    self.filter_size = filter_size
    self.dropout = dropout
    # Xavier weights; bias uniform in [-1/sqrt(fan_in), +1/sqrt(fan_in)]
    k = math.sqrt(1.0 / self.input_size)
    self.conv1 = Conv1D(
        num_channels=self.input_size,
        num_filters=self.out_channels,
        filter_size=self.filter_size,
        padding=1,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.XavierInitializer()),
        bias_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(low=-k, high=k)))
    #data_format='NTC')
    k = math.sqrt(1.0 / self.out_channels)
    self.conv2 = Conv1D(
        num_channels=self.out_channels,
        num_filters=self.out_channels,
        filter_size=self.filter_size,
        padding=1,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.XavierInitializer()),
        bias_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(low=-k, high=k)))
    #data_format='NTC')
    self.layer_norm1 = dg.LayerNorm(self.out_channels)
    self.layer_norm2 = dg.LayerNorm(self.out_channels)
    self.weight = fluid.ParamAttr(
        initializer=fluid.initializer.XavierInitializer())
    k = math.sqrt(1.0 / self.out_channels)
    self.bias = fluid.ParamAttr(
        initializer=fluid.initializer.Uniform(low=-k, high=k))
    # final projection to a scalar duration value per time step
    self.linear = dg.Linear(self.out_channels, 1,
                            param_attr=self.weight, bias_attr=self.bias)
def __init__(self, num_units, num_layers=4):
    """Highway network

    Args:
        num_units (int): dimension of hidden unit.
        num_layers (int, optional): number of highway layers. Defaults to 4.
    """
    super(Highwaynet, self).__init__()
    self.num_units = num_units
    self.num_layers = num_layers
    self.gates = []
    self.linears = []
    bound = math.sqrt(1.0 / num_units)

    def _fc():
        # Xavier weight; bias uniform in [-bound, +bound]
        return dg.Linear(
            num_units,
            num_units,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-bound, high=bound)))

    for _ in range(num_layers):
        # creation order per iteration (linear, then gate) preserved
        # so framework-generated parameter names are unchanged
        self.linears.append(_fc())
        self.gates.append(_fc())
    for i, (linear, gate) in enumerate(zip(self.linears, self.gates)):
        self.add_sublayer("linears_{}".format(i), linear)
        self.add_sublayer("gates_{}".format(i), gate)
def __init__(self):
    """HAR classifier: three conv-BN-dropout stages followed by a two-layer FC head."""
    super(HarFcn, self).__init__()
    self.cnn1 = dy.Sequential(
        dy.Conv2D(num_channels=1, num_filters=128, filter_size=3, stride=1, padding=1),
        dy.BatchNorm(num_channels=128),
        dy.Dropout(p=.2),
    )
    self.cnn2 = dy.Sequential(
        dy.Conv2D(num_channels=128, num_filters=128, filter_size=3, stride=1, padding=1),
        dy.BatchNorm(num_channels=128),
        dy.Dropout(p=.2),
    )
    self.cnn3 = dy.Sequential(
        dy.Conv2D(num_channels=128, num_filters=128, filter_size=3, stride=1, padding=1),
        dy.BatchNorm(num_channels=128),
        dy.Dropout(p=.2),
    )
    # input_dim=384 assumes that flattened conv output — TODO confirm against input shape
    self.cls = dy.Sequential(
        dy.Linear(input_dim=384, output_dim=128),
        dy.Dropout(p=.2),
        dy.Linear(input_dim=128, output_dim=5),
    )
def __init__(self, in_channel, kernel_size, causal=False, has_bias=False,
             bias_dim=None, keep_prob=1.):
    """Gated 1-D conv block (output channels = 2 * in_channel) with an optional bias path."""
    super(ConvBlock, self).__init__()
    self.causal = causal
    self.keep_prob = keep_prob
    self.in_channel = in_channel
    self.has_bias = has_bias
    # fan-in scaled init, corrected for the dropout keep probability;
    # causal blocks use "valid" padding (padding is applied in forward)
    conv_std = np.sqrt(4 * keep_prob / (kernel_size * in_channel))
    self.conv = weight_norm(
        Conv1D(in_channel,
               2 * in_channel,
               (kernel_size, ),
               padding="valid" if causal else "same",
               data_format="NTC",
               param_attr=I.Normal(scale=conv_std)))
    if has_bias:
        self.bias_affine = dg.Linear(
            bias_dim, 2 * in_channel,
            param_attr=I.Normal(scale=np.sqrt(1 / bias_dim)))
def __init__(self, attention_dim, input_dim, position_encoding_weight=1., position_rate=1., reduction_factor=1, has_bias=False, bias_dim=0, keep_prob=1.): super(AttentionBlock, self).__init__() # positional encoding omega_default = position_rate / reduction_factor self.omega_default = omega_default # multispeaker case if has_bias: std = np.sqrt(1.0 / bias_dim) initializer = I.NormalInitializer(loc=0., scale=std) self.q_pos_affine = dg.Linear(bias_dim, 1, param_attr=initializer) self.k_pos_affine = dg.Linear(bias_dim, 1, param_attr=initializer) self.omega_initial = self.create_parameter( shape=[1], attr=I.ConstantInitializer(value=omega_default)) # mind the fact that q, k, v have the same feature dimension # so we can init k_affine and q_affine's weight as the same matrix # to get a better init attention init_weight = np.random.normal(size=(input_dim, attention_dim), scale=np.sqrt(1. / input_dim)) initializer = I.NumpyArrayInitializer(init_weight.astype(np.float32)) # 3 affine transformation to project q, k, v into attention_dim q_affine = dg.Linear(input_dim, attention_dim, param_attr=initializer) self.q_affine = weight_norm(q_affine, dim=-1) k_affine = dg.Linear(input_dim, attention_dim, param_attr=initializer) self.k_affine = weight_norm(k_affine, dim=-1) std = np.sqrt(1.0 / input_dim) initializer = I.NormalInitializer(loc=0., scale=std) v_affine = dg.Linear(input_dim, attention_dim, param_attr=initializer) self.v_affine = weight_norm(v_affine, dim=-1) std = np.sqrt(1.0 / attention_dim) initializer = I.NormalInitializer(loc=0., scale=std) out_affine = dg.Linear(attention_dim, input_dim, param_attr=initializer) self.out_affine = weight_norm(out_affine, dim=-1) self.keep_prob = keep_prob self.has_bias = has_bias self.bias_dim = bias_dim self.attention_dim = attention_dim self.position_encoding_weight = position_encoding_weight
def _get_conv_layer(self, in_channels, out_channels, kernel_size, stride,
                    padding, dilation, groups, bias, padding_mode, input_dim):
    """Build a linear layer (input_dim == 0) or a Conv{1,2,3}D layer for the given rank.

    `padding_mode` is accepted for interface compatibility but not used here.
    """
    if input_dim == 0:
        # a 0-dimensional "convolution" degenerates to a fully-connected layer
        return dg.Linear(in_channels, out_channels, bias_attr=bias)
    conv_cls = getattr(dg, 'Conv%dD' % input_dim)
    return conv_cls(
        num_channels=in_channels,
        num_filters=out_channels,
        filter_size=kernel_size,
        stride=stride,
        padding=padding,
        dilation=dilation,
        groups=groups,
        bias_attr=bias,
    )
def __init__(self, in_channels, out_channels, gain=2**(0.5), use_wscale=False,
             lrmul=1.0, bias=True):
    """ The complete conversion of Dense/FC/Linear Layer of original Tensorflow version.

    Args:
        in_channels (int): input feature size.
        out_channels (int): output feature size.
        gain (float): He-init gain factor.
        use_wscale (bool): select the runtime weight multiplier variant (see below).
        lrmul (float): learning-rate multiplier folded into the init.
        bias (bool): include a bias parameter.
    """
    super(FC, self).__init__()
    self.out_channels = out_channels
    he_std = gain * in_channels**(-0.5)  # He init
    # NOTE(review): in the standard wscale scheme the multiplier is applied at
    # forward time; here it is folded into the initializer below — confirm this
    # matches the TF original (the commented-out lines hint at the intended math).
    if use_wscale:
        # init_std = 1.0 / lrmul
        # self.w_lrmul = he_std * lrmul
        self.w_lrmul = lrmul
    else:
        # init_std = he_std / lrmul
        # self.w_lrmul = lrmul
        self.w_lrmul = 1.0
    # weights are materialized as numpy arrays and handed to the layer via initializers
    w = np.random.randn(in_channels, out_channels) * he_std * self.w_lrmul
    self.weight_attr = fluid.ParamAttr(
        initializer=fluid.initializer.NumpyArrayInitializer(w))
    #self.weight = layers.create_parameter((in_channels,out_channels),'float32')
    if bias:
        self.b_lrmul = lrmul
        b = np.random.randn(out_channels) * self.b_lrmul
        self.bias_attr = fluid.ParamAttr(
            initializer=fluid.initializer.NumpyArrayInitializer(b))
        # self.bias = layers.create_parameter((out_channels,),'float32')
    else:
        self.bias_attr = False
    self.linear = dygraph.Linear(in_channels,
                                 out_channels,
                                 param_attr=self.weight_attr,
                                 bias_attr=self.bias_attr,
                                 dtype='float32')