def __init__(self, dim, in_dim, head_cnt=1, kernel_ratio=0.5, dp1=0.1, dp2=0.1):
    super().__init__()
    self.emb = in_dim * head_cnt  # head_cnt is 1 here, so this is effectively in_dim
    self.kqv = nn.Linear(dim, 3 * self.emb)
    self.dp = nn.Dropout(dp1)
    self.proj = nn.Linear(self.emb, self.emb)
    self.head_cnt = head_cnt
    self.norm1 = nn.LayerNorm(dim)
    self.norm2 = nn.LayerNorm(self.emb)
    self.epsilon = 1e-8  # for numerical stability in division
    self.mlp = nn.Sequential(
        nn.Linear(self.emb, 1 * self.emb),
        nn.GELU(),
        nn.Linear(1 * self.emb, self.emb),
        nn.Dropout(dp2),
    )
    self.m = int(self.emb * kernel_ratio)
    self.w = paddle.randn((self.m, self.emb))
    self.w = add_parameter(self, orthogonal_(self.w) * math.sqrt(self.m))
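# Hedged sketch (assumption, not from the source): the orthogonal random projection
# self.w of shape (m, emb) is the kind of matrix typically used for a Performer-style
# softmax-kernel feature map, phi(x) = exp(x @ w.T - ||x||^2 / 2) / sqrt(m).
# The helper below only illustrates that idea; the name and wiring are hypothetical.
import math
import paddle

def prm_exp_sketch(x, w):
    # x: (batch, tokens, emb); w: (m, emb) orthogonal random features
    xd = (x * x).sum(axis=-1, keepdim=True) / 2           # ||x||^2 / 2 per token
    wtx = paddle.matmul(x, w, transpose_y=True)           # (batch, tokens, m)
    return paddle.exp(wtx - xd) / math.sqrt(w.shape[0])   # positive random features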
def __init__(self, in_size, out_size):
    super(SimpleModel, self).__init__()
    self.linear = nn.Linear(in_size, out_size)
    self.dropout_1 = paddle.nn.Dropout(0.1)
    self.relu = nn.ReLU()
    self.dropout_2 = paddle.nn.Dropout(0.5)
    self.gelu = nn.GELU()
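# Hedged sketch (assumption, not from the source): the snippet above only defines
# __init__; a plausible forward pass applies the layers in declaration order.
def simple_model_forward_sketch(model, x):
    x = model.dropout_1(model.linear(x))
    x = model.dropout_2(model.relu(x))
    return model.gelu(x)

# usage (illustrative): simple_model_forward_sketch(SimpleModel(8, 4), paddle.randn([2, 8]))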
def __init__(self, inplanes=256, planes=256, kernel_size=9, dilation=1, dropout_rate=0.1):
    super(ResnetBasicBlock, self).__init__()
    self.conv1 = nn.Conv1D(in_channels=inplanes, out_channels=planes,
                           kernel_size=kernel_size, dilation=dilation,
                           padding="same", data_format="NLC",
                           weight_attr=nn.initializer.KaimingNormal())
    self.bn1 = nn.BatchNorm1D(planes, data_format="NLC")
    self.gelu1 = nn.GELU()
    self.dropout1 = nn.Dropout(p=dropout_rate)
    self.conv2 = nn.Conv1D(in_channels=planes, out_channels=planes,
                           kernel_size=kernel_size, dilation=dilation,
                           padding="same", data_format="NLC",
                           weight_attr=nn.initializer.KaimingNormal())
    self.bn2 = nn.BatchNorm1D(planes, data_format="NLC")
    self.gelu2 = nn.GELU()
    self.dropout2 = nn.Dropout(p=dropout_rate)
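# Hedged sketch (assumption, not from the source): with padding="same" convolutions the
# output keeps the input length, so a typical residual block forward is two
# conv -> batchnorm -> GELU -> dropout stages followed by a residual addition.
def resnet_basic_block_forward_sketch(block, x):
    # x: (batch, length, channels), matching data_format="NLC"
    h = block.dropout1(block.gelu1(block.bn1(block.conv1(x))))
    h = block.dropout2(block.gelu2(block.bn2(block.conv2(h))))
    return x + h  # residual connection (hypothetical wiring)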
def __init__(self, config):
    super(LayoutXLMIntermediate, self).__init__()
    self.dense = nn.Linear(config["hidden_size"], config["intermediate_size"])
    if config["hidden_act"] == "gelu":
        self.intermediate_act_fn = nn.GELU()
    else:
        assert False, "hidden_act is set as: {}, please check it.".format(
            config["hidden_act"])
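# Hedged sketch (assumption, not from the source): the intermediate layer is normally
# applied as a single projection followed by the configured activation.
def layoutxlm_intermediate_forward_sketch(layer, hidden_states):
    return layer.intermediate_act_fn(layer.dense(hidden_states))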
def __init__(self, n_head, hidden_size, attn_dropout, act_dropout):
    super(Plato2EncoderLayer, self).__init__()
    self.self_attn = nn.MultiHeadAttention(hidden_size, n_head, attn_dropout)
    self.pre_norm_layer = nn.LayerNorm(hidden_size)
    self.post_norm_layer = nn.LayerNorm(hidden_size)
    self.fc1 = nn.Linear(hidden_size, hidden_size * 4)
    self.fc2 = nn.Linear(hidden_size * 4, hidden_size)
    self.dropout_layer = nn.Dropout(act_dropout)
    self.gelu_layer = nn.GELU()
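# Hedged sketch (assumption, not from the source): the layer names suggest a pre-norm
# transformer block: normalized self-attention plus residual, then a GELU feed-forward
# network with another residual. The mask convention follows nn.MultiHeadAttention.
def plato2_encoder_layer_forward_sketch(layer, x, attn_mask=None):
    h = layer.self_attn(layer.pre_norm_layer(x), attn_mask=attn_mask)
    x = x + layer.dropout_layer(h)
    h = layer.fc2(layer.gelu_layer(layer.fc1(layer.post_norm_layer(x))))
    return x + layer.dropout_layer(h)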
def __init__(self, distilbert):
    super(DistilBertForMaskedLM, self).__init__()
    self.distilbert = distilbert
    self.vocab_transform = nn.Linear(self.distilbert.config["hidden_size"],
                                     self.distilbert.config["hidden_size"])
    self.activation = nn.GELU()
    self.vocab_layer_norm = nn.LayerNorm(self.distilbert.config["hidden_size"])
    self.vocab_projector = nn.Linear(self.distilbert.config["hidden_size"],
                                     self.distilbert.config["vocab_size"])
    self.apply(self.init_weights)
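# Hedged sketch (assumption, not from the source): a conventional masked-LM head maps
# the encoder output to vocabulary logits as transform -> GELU -> layer norm -> projector.
def distilbert_mlm_logits_sketch(model, sequence_output):
    h = model.activation(model.vocab_transform(sequence_output))
    return model.vocab_projector(model.vocab_layer_norm(h))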
def __init__(self, embedding_size, vocab_size, hidden_size):
    super(ErnieCtmMLMHead, self).__init__()
    self.layer_norm = nn.LayerNorm(embedding_size)
    self.bias = self.create_parameter(
        [vocab_size],
        is_bias=True,
        default_initializer=nn.initializer.Constant(value=0.0))
    self.dense = nn.Linear(hidden_size, embedding_size)
    self.decoder = nn.Linear(embedding_size, vocab_size)
    self.activation = nn.GELU(approximate=True)
    # Link bias
    self.decoder.bias = self.bias
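# Hedged sketch (assumption, not from the source): a typical MLM head ordering is
# dense -> activation -> layer norm -> decoder, with the linked bias added by the decoder.
def ernie_ctm_mlm_head_forward_sketch(head, hidden_states):
    h = head.layer_norm(head.activation(head.dense(hidden_states)))
    return head.decoder(h)  # decoder.bias is the shared self.bias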
def __init__(self,
             vocab_size,
             embedding_size=128,
             hidden_size=768,
             num_hidden_layers=12,
             num_attention_heads=12,
             intermediate_size=3072,
             hidden_dropout_prob=0.1,
             attention_probs_dropout_prob=0.1,
             max_position_embeddings=512,
             type_vocab_size=16,
             initializer_range=0.02,
             pad_token_id=0,
             use_content_summary=True,
             content_summary_index=1,
             cls_num=2):
    super(ErnieCtmModel, self).__init__()
    self.pad_token_id = pad_token_id
    self.content_summary_index = content_summary_index
    self.initializer_range = initializer_range
    self.embeddings = ErnieCtmEmbeddings(
        vocab_size,
        embedding_size,
        hidden_dropout_prob=hidden_dropout_prob,
        max_position_embeddings=max_position_embeddings,
        type_vocab_size=type_vocab_size,
        padding_idx=pad_token_id,
        cls_num=cls_num)
    self.embedding_hidden_mapping_in = nn.Linear(embedding_size, hidden_size)
    encoder_layer = nn.TransformerEncoderLayer(
        hidden_size,
        num_attention_heads,
        intermediate_size,
        dropout=hidden_dropout_prob,
        activation="gelu",
        attn_dropout=attention_probs_dropout_prob,
        act_dropout=0)
    encoder_layer.activation = nn.GELU(approximate=True)
    self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers)
    self.pooler = ErnieCtmPooler(hidden_size)
    self.use_content_summary = use_content_summary
    self.content_summary_index = content_summary_index
    if use_content_summary is True:
        self.feature_fuse = nn.Linear(hidden_size * 2, intermediate_size)
        self.feature_output = nn.Linear(intermediate_size, hidden_size)
    self.apply(self.init_weights)
def __init__(self,
             nsp_reader,
             num_layers,
             n_head,
             hidden_size,
             vocab_size=8001,
             type_size=2,
             latent_type_size=20,
             max_position_seq_len=256,
             act_dropout=0.1,
             attn_dropout=0.1,
             max_dec_len=64,
             min_dec_len=1,
             topk=10):
    super(Plato2InferModel, self).__init__()
    self.nsp_reader = nsp_reader
    self.num_layers = num_layers
    self.latent_type_size = latent_type_size
    self.max_dec_len = max_dec_len
    self.min_dec_len = min_dec_len
    self.topk = topk
    self.unk_id = 0
    self.bos_id = 1
    self.eos_id = 2
    self.mask_id = 8000
    self.after_eos = paddle.ones([vocab_size]) * -1e9
    self.after_eos[self.eos_id] = 0
    self.is_cn = False
    self.batch_size = 1
    self.latent_weight = paddle.create_parameter(
        [hidden_size, latent_type_size], 'float32')
    self.plato2_encoder = Plato2Encoder(
        vocab_size, type_size, max_position_seq_len, num_layers, n_head,
        hidden_size, attn_dropout, act_dropout)
    self.logits_fc_layer = nn.Linear(hidden_size, hidden_size)
    self.logits_layer_norm = nn.LayerNorm(hidden_size)
    self.logits_bias = paddle.create_parameter(
        [vocab_size], 'float32', is_bias=True)
    self.nsp_predictor = NSP(vocab_size, type_size, max_position_seq_len,
                             num_layers, n_head, hidden_size, attn_dropout,
                             act_dropout)
    self.gelu_layer = nn.GELU()
    self.softmax = nn.Softmax()
def __init__(self,
             vocab_size,
             emb_dim=128,
             hidden_size=256,
             kernel_size=9,
             n_layers=35,
             padding_idx=0,
             dropout_rate=0.1,
             epsilon=1e-6):
    super(ResnetEncoderModel, self).__init__()
    self.hidden_size = hidden_size
    self.n_layers = n_layers
    self.token_embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=padding_idx)
    max_pos_len = 3000
    self.pos_embedding = nn.Embedding(max_pos_len, emb_dim, padding_idx=padding_idx)
    self.layer_norm = nn.BatchNorm1D(emb_dim, data_format="NLC")
    self.dropout = nn.Dropout(dropout_rate)
    self.padded_conv = nn.Sequential(
        nn.Conv1D(in_channels=emb_dim, out_channels=hidden_size,
                  kernel_size=kernel_size, padding="same", data_format="NLC",
                  weight_attr=nn.initializer.KaimingNormal()),
        nn.BatchNorm1D(hidden_size, data_format="NLC"),
        nn.GELU(),
        nn.Dropout(p=dropout_rate))
    self.residual_block_1 = ResnetBasicBlock(inplanes=hidden_size,
                                             planes=hidden_size,
                                             kernel_size=kernel_size,
                                             dropout_rate=dropout_rate)
    self.residual_block_n = nn.Sequential()
    for i in range(1, n_layers):
        self.residual_block_n.add_sublayer(
            "residual_block_%d" % i,
            ResnetBasicBlock(inplanes=hidden_size, planes=hidden_size,
                             kernel_size=kernel_size, dilation=2,
                             dropout_rate=dropout_rate))
    self.apply(self.init_weights)
def __init__(self,
             dim,
             num_heads=8,
             qkv_bias=False,
             qk_scale=None,
             attn_drop=0.,
             proj_drop=0.,
             sr_ratio=1,
             linear=False):
    super().__init__()
    assert dim % num_heads == 0
    self.dim = dim
    self.num_heads = num_heads
    head_dim = dim // num_heads
    self.scale = qk_scale or head_dim**-0.5
    self.q = nn.Linear(dim, dim, bias_attr=qkv_bias)
    self.kv = nn.Linear(dim, dim * 2, bias_attr=qkv_bias)
    self.attn_drop = nn.Dropout(attn_drop)
    self.proj = nn.Linear(dim, dim)
    self.proj_drop = nn.Dropout(proj_drop)
    self.linear = linear
    self.sr_ratio = sr_ratio
    if not linear:
        if sr_ratio > 1:
            self.sr = nn.Conv2D(dim, dim, kernel_size=sr_ratio, stride=sr_ratio)
            self.norm = nn.LayerNorm(dim)
    else:
        self.pool = nn.AdaptiveAvgPool2D(7)
        self.sr = nn.Conv2D(dim, dim, kernel_size=1, stride=1)
        self.norm = nn.LayerNorm(dim)
        self.act = nn.GELU()
def func_test_layer_str(self):
    module = nn.ELU(0.2)
    self.assertEqual(str(module), 'ELU(alpha=0.2)')
    module = nn.CELU(0.2)
    self.assertEqual(str(module), 'CELU(alpha=0.2)')
    module = nn.GELU(True)
    self.assertEqual(str(module), 'GELU(approximate=True)')
    module = nn.Hardshrink()
    self.assertEqual(str(module), 'Hardshrink(threshold=0.5)')
    module = nn.Hardswish(name="Hardswish")
    self.assertEqual(str(module), 'Hardswish(name=Hardswish)')
    module = nn.Tanh(name="Tanh")
    self.assertEqual(str(module), 'Tanh(name=Tanh)')
    module = nn.Hardtanh(name="Hardtanh")
    self.assertEqual(str(module), 'Hardtanh(min=-1.0, max=1.0, name=Hardtanh)')
    module = nn.PReLU(1, 0.25, name="PReLU", data_format="NCHW")
    self.assertEqual(
        str(module),
        'PReLU(num_parameters=1, data_format=NCHW, init=0.25, dtype=float32, name=PReLU)')
    module = nn.ReLU()
    self.assertEqual(str(module), 'ReLU()')
    module = nn.ReLU6()
    self.assertEqual(str(module), 'ReLU6()')
    module = nn.SELU()
    self.assertEqual(
        str(module),
        'SELU(scale=1.0507009873554805, alpha=1.6732632423543772)')
    module = nn.LeakyReLU()
    self.assertEqual(str(module), 'LeakyReLU(negative_slope=0.01)')
    module = nn.Sigmoid()
    self.assertEqual(str(module), 'Sigmoid()')
    module = nn.Hardsigmoid()
    self.assertEqual(str(module), 'Hardsigmoid()')
    module = nn.Softplus()
    self.assertEqual(str(module), 'Softplus(beta=1, threshold=20)')
    module = nn.Softshrink()
    self.assertEqual(str(module), 'Softshrink(threshold=0.5)')
    module = nn.Softsign()
    self.assertEqual(str(module), 'Softsign()')
    module = nn.Swish()
    self.assertEqual(str(module), 'Swish()')
    module = nn.Tanhshrink()
    self.assertEqual(str(module), 'Tanhshrink()')
    module = nn.ThresholdedReLU()
    self.assertEqual(str(module), 'ThresholdedReLU(threshold=1.0)')
    module = nn.LogSigmoid()
    self.assertEqual(str(module), 'LogSigmoid()')
    module = nn.Softmax()
    self.assertEqual(str(module), 'Softmax(axis=-1)')
    module = nn.LogSoftmax()
    self.assertEqual(str(module), 'LogSoftmax(axis=-1)')
    module = nn.Maxout(groups=2)
    self.assertEqual(str(module), 'Maxout(groups=2, axis=1)')
    module = nn.Linear(2, 4, name='linear')
    self.assertEqual(
        str(module),
        'Linear(in_features=2, out_features=4, dtype=float32, name=linear)')
    module = nn.Upsample(size=[12, 12])
    self.assertEqual(
        str(module),
        'Upsample(size=[12, 12], mode=nearest, align_corners=False, align_mode=0, data_format=NCHW)')
    module = nn.UpsamplingNearest2D(size=[12, 12])
    self.assertEqual(
        str(module), 'UpsamplingNearest2D(size=[12, 12], data_format=NCHW)')
    module = nn.UpsamplingBilinear2D(size=[12, 12])
    self.assertEqual(
        str(module), 'UpsamplingBilinear2D(size=[12, 12], data_format=NCHW)')
    module = nn.Bilinear(in1_features=5, in2_features=4, out_features=1000)
    self.assertEqual(
        str(module),
        'Bilinear(in1_features=5, in2_features=4, out_features=1000, dtype=float32)')
    module = nn.Dropout(p=0.5)
    self.assertEqual(str(module),
                     'Dropout(p=0.5, axis=None, mode=upscale_in_train)')
    module = nn.Dropout2D(p=0.5)
    self.assertEqual(str(module), 'Dropout2D(p=0.5, data_format=NCHW)')
    module = nn.Dropout3D(p=0.5)
    self.assertEqual(str(module), 'Dropout3D(p=0.5, data_format=NCDHW)')
    module = nn.AlphaDropout(p=0.5)
    self.assertEqual(str(module), 'AlphaDropout(p=0.5)')
    module = nn.Pad1D(padding=[1, 2], mode='constant')
    self.assertEqual(
        str(module),
        'Pad1D(padding=[1, 2], mode=constant, value=0.0, data_format=NCL)')
    module = nn.Pad2D(padding=[1, 0, 1, 2], mode='constant')
    self.assertEqual(
        str(module),
        'Pad2D(padding=[1, 0, 1, 2], mode=constant, value=0.0, data_format=NCHW)')
    module = nn.ZeroPad2D(padding=[1, 0, 1, 2])
    self.assertEqual(str(module),
                     'ZeroPad2D(padding=[1, 0, 1, 2], data_format=NCHW)')
    module = nn.Pad3D(padding=[1, 0, 1, 2, 0, 0], mode='constant')
    self.assertEqual(
        str(module),
        'Pad3D(padding=[1, 0, 1, 2, 0, 0], mode=constant, value=0.0, data_format=NCDHW)')
    module = nn.CosineSimilarity(axis=0)
    self.assertEqual(str(module), 'CosineSimilarity(axis=0, eps=1e-08)')
    module = nn.Embedding(10, 3, sparse=True)
    self.assertEqual(str(module), 'Embedding(10, 3, sparse=True)')
    module = nn.Conv1D(3, 2, 3)
    self.assertEqual(str(module),
                     'Conv1D(3, 2, kernel_size=[3], data_format=NCL)')
    module = nn.Conv1DTranspose(2, 1, 2)
    self.assertEqual(
        str(module),
        'Conv1DTranspose(2, 1, kernel_size=[2], data_format=NCL)')
    module = nn.Conv2D(4, 6, (3, 3))
    self.assertEqual(str(module),
                     'Conv2D(4, 6, kernel_size=[3, 3], data_format=NCHW)')
    module = nn.Conv2DTranspose(4, 6, (3, 3))
    self.assertEqual(
        str(module),
        'Conv2DTranspose(4, 6, kernel_size=[3, 3], data_format=NCHW)')
    module = nn.Conv3D(4, 6, (3, 3, 3))
    self.assertEqual(
        str(module),
        'Conv3D(4, 6, kernel_size=[3, 3, 3], data_format=NCDHW)')
    module = nn.Conv3DTranspose(4, 6, (3, 3, 3))
    self.assertEqual(
        str(module),
        'Conv3DTranspose(4, 6, kernel_size=[3, 3, 3], data_format=NCDHW)')
    module = nn.PairwiseDistance()
    self.assertEqual(str(module), 'PairwiseDistance(p=2.0)')
    module = nn.InstanceNorm1D(2)
    self.assertEqual(str(module),
                     'InstanceNorm1D(num_features=2, epsilon=1e-05)')
    module = nn.InstanceNorm2D(2)
    self.assertEqual(str(module),
                     'InstanceNorm2D(num_features=2, epsilon=1e-05)')
    module = nn.InstanceNorm3D(2)
    self.assertEqual(str(module),
                     'InstanceNorm3D(num_features=2, epsilon=1e-05)')
    module = nn.GroupNorm(num_channels=6, num_groups=6)
    self.assertEqual(
        str(module),
        'GroupNorm(num_groups=6, num_channels=6, epsilon=1e-05)')
    module = nn.LayerNorm([2, 2, 3])
    self.assertEqual(
        str(module),
        'LayerNorm(normalized_shape=[2, 2, 3], epsilon=1e-05)')
    module = nn.BatchNorm1D(1)
    self.assertEqual(
        str(module),
        'BatchNorm1D(num_features=1, momentum=0.9, epsilon=1e-05, data_format=NCL)')
    module = nn.BatchNorm2D(1)
    self.assertEqual(
        str(module),
        'BatchNorm2D(num_features=1, momentum=0.9, epsilon=1e-05)')
    module = nn.BatchNorm3D(1)
    self.assertEqual(
        str(module),
        'BatchNorm3D(num_features=1, momentum=0.9, epsilon=1e-05, data_format=NCDHW)')
    module = nn.SyncBatchNorm(2)
    self.assertEqual(
        str(module),
        'SyncBatchNorm(num_features=2, momentum=0.9, epsilon=1e-05)')
    module = nn.LocalResponseNorm(size=5)
    self.assertEqual(
        str(module),
        'LocalResponseNorm(size=5, alpha=0.0001, beta=0.75, k=1.0)')
    module = nn.AvgPool1D(kernel_size=2, stride=2, padding=0)
    self.assertEqual(str(module),
                     'AvgPool1D(kernel_size=2, stride=2, padding=0)')
    module = nn.AvgPool2D(kernel_size=2, stride=2, padding=0)
    self.assertEqual(str(module),
                     'AvgPool2D(kernel_size=2, stride=2, padding=0)')
    module = nn.AvgPool3D(kernel_size=2, stride=2, padding=0)
    self.assertEqual(str(module),
                     'AvgPool3D(kernel_size=2, stride=2, padding=0)')
    module = nn.MaxPool1D(kernel_size=2, stride=2, padding=0)
    self.assertEqual(str(module),
                     'MaxPool1D(kernel_size=2, stride=2, padding=0)')
    module = nn.MaxPool2D(kernel_size=2, stride=2, padding=0)
    self.assertEqual(str(module),
                     'MaxPool2D(kernel_size=2, stride=2, padding=0)')
    module = nn.MaxPool3D(kernel_size=2, stride=2, padding=0)
    self.assertEqual(str(module),
                     'MaxPool3D(kernel_size=2, stride=2, padding=0)')
    module = nn.AdaptiveAvgPool1D(output_size=16)
    self.assertEqual(str(module), 'AdaptiveAvgPool1D(output_size=16)')
    module = nn.AdaptiveAvgPool2D(output_size=3)
    self.assertEqual(str(module), 'AdaptiveAvgPool2D(output_size=3)')
    module = nn.AdaptiveAvgPool3D(output_size=3)
    self.assertEqual(str(module), 'AdaptiveAvgPool3D(output_size=3)')
    module = nn.AdaptiveMaxPool1D(output_size=16, return_mask=True)
    self.assertEqual(
        str(module), 'AdaptiveMaxPool1D(output_size=16, return_mask=True)')
    module = nn.AdaptiveMaxPool2D(output_size=3, return_mask=True)
    self.assertEqual(str(module),
                     'AdaptiveMaxPool2D(output_size=3, return_mask=True)')
    module = nn.AdaptiveMaxPool3D(output_size=3, return_mask=True)
    self.assertEqual(str(module),
                     'AdaptiveMaxPool3D(output_size=3, return_mask=True)')
    module = nn.SimpleRNNCell(16, 32)
    self.assertEqual(str(module), 'SimpleRNNCell(16, 32)')
    module = nn.LSTMCell(16, 32)
    self.assertEqual(str(module), 'LSTMCell(16, 32)')
    module = nn.GRUCell(16, 32)
    self.assertEqual(str(module), 'GRUCell(16, 32)')
    module = nn.PixelShuffle(3)
    self.assertEqual(str(module), 'PixelShuffle(upscale_factor=3)')
    module = nn.SimpleRNN(16, 32, 2)
    self.assertEqual(
        str(module),
        'SimpleRNN(16, 32, num_layers=2\n (0): RNN(\n (cell): SimpleRNNCell(16, 32)\n )\n (1): RNN(\n (cell): SimpleRNNCell(32, 32)\n )\n)')
    module = nn.LSTM(16, 32, 2)
    self.assertEqual(
        str(module),
        'LSTM(16, 32, num_layers=2\n (0): RNN(\n (cell): LSTMCell(16, 32)\n )\n (1): RNN(\n (cell): LSTMCell(32, 32)\n )\n)')
    module = nn.GRU(16, 32, 2)
    self.assertEqual(
        str(module),
        'GRU(16, 32, num_layers=2\n (0): RNN(\n (cell): GRUCell(16, 32)\n )\n (1): RNN(\n (cell): GRUCell(32, 32)\n )\n)')
    module1 = nn.Sequential(
        ('conv1', nn.Conv2D(1, 20, 5)), ('relu1', nn.ReLU()),
        ('conv2', nn.Conv2D(20, 64, 5)), ('relu2', nn.ReLU()))
    self.assertEqual(
        str(module1),
        'Sequential(\n '
        '(conv1): Conv2D(1, 20, kernel_size=[5, 5], data_format=NCHW)\n '
        '(relu1): ReLU()\n '
        '(conv2): Conv2D(20, 64, kernel_size=[5, 5], data_format=NCHW)\n '
        '(relu2): ReLU()\n)')
    module2 = nn.Sequential(
        nn.Conv3DTranspose(4, 6, (3, 3, 3)),
        nn.AvgPool3D(kernel_size=2, stride=2, padding=0),
        nn.Tanh(name="Tanh"), module1, nn.Conv3D(4, 6, (3, 3, 3)),
        nn.MaxPool3D(kernel_size=2, stride=2, padding=0), nn.GELU(True))
    self.assertEqual(
        str(module2),
        'Sequential(\n '
        '(0): Conv3DTranspose(4, 6, kernel_size=[3, 3, 3], data_format=NCDHW)\n '
        '(1): AvgPool3D(kernel_size=2, stride=2, padding=0)\n '
        '(2): Tanh(name=Tanh)\n '
        '(3): Sequential(\n (conv1): Conv2D(1, 20, kernel_size=[5, 5], data_format=NCHW)\n (relu1): ReLU()\n'
        ' (conv2): Conv2D(20, 64, kernel_size=[5, 5], data_format=NCHW)\n (relu2): ReLU()\n )\n '
        '(4): Conv3D(4, 6, kernel_size=[3, 3, 3], data_format=NCDHW)\n '
        '(5): MaxPool3D(kernel_size=2, stride=2, padding=0)\n '
        '(6): GELU(approximate=True)\n)')
def __init__(self, _emb_size, _n_layer, _n_head, _voc_size,
             _max_position_seq_len, _sent_types, hidden_act, _dropout,
             _attention_dropout, initializer_range):
    super(BertModel, self).__init__()
    self._emb_size = _emb_size
    self._n_layer = _n_layer
    self._n_head = _n_head
    self._voc_size = _voc_size
    self._max_position_seq_len = _max_position_seq_len
    self._sent_types = _sent_types
    if hidden_act == "gelu":
        self._hidden_act = nn.GELU()
    else:
        self._hidden_act = nn.ReLU()
    self._dropout = _dropout
    self._attention_dropout = _attention_dropout
    self._word_emb_name = "word_embedding"
    self._pos_emb_name = "pos_embedding"
    self._sent_emb_name = "sent_embedding"
    self._dtype = "float32"
    self._param_initializer = nn.initializer.TruncatedNormal(
        std=initializer_range)
    self.word_emb = nn.Embedding(
        num_embeddings=self._voc_size,
        embedding_dim=self._emb_size,
        name=self._word_emb_name,
        weight_attr=paddle.ParamAttr(initializer=self._param_initializer),
        sparse=False)
    self.position_emb = nn.Embedding(
        num_embeddings=self._max_position_seq_len,
        embedding_dim=self._emb_size,
        weight_attr=paddle.ParamAttr(name=self._pos_emb_name,
                                     initializer=self._param_initializer),
        sparse=False)
    self.sent_emb = nn.Embedding(
        num_embeddings=self._sent_types,
        embedding_dim=self._emb_size,
        weight_attr=paddle.ParamAttr(name=self._sent_emb_name,
                                     initializer=self._param_initializer),
        sparse=False)
    self.enc_pre_process_layer = NormalizeDropLayer(
        self._dropout, self._emb_size, name='pre_encoder')
    self._enc_out_layer = Encoder(
        n_layer=self._n_layer,
        n_head=self._n_head,
        d_key=self._emb_size // self._n_head,
        d_value=self._emb_size // self._n_head,
        d_model=self._emb_size,
        d_inner_hid=self._emb_size * 4,
        attention_dropout=self._attention_dropout,
        hidden_act=self._hidden_act,
        param_initializer=self._param_initializer,
        name='encoder')
    self.mask_trans_feat = nn.Linear(
        in_features=self._emb_size,
        out_features=self._emb_size,
        weight_attr=paddle.ParamAttr(name="mask_lm_trans_fc.w_0",
                                     initializer=self._param_initializer),
        bias_attr=paddle.ParamAttr(name='mask_lm_trans_fc.b_0'))
    self.mask_trans_act = self._hidden_act
    self.mask_post_process_layer = NormalizeLayer(self._emb_size,
                                                  name='mask_lm_trans')
    self.mask_lm_out_bias = self.create_parameter(
        shape=[self._voc_size],
        dtype=self._dtype,
        attr=paddle.ParamAttr(
            name="mask_lm_out_fc.b_0",
            initializer=paddle.nn.initializer.Constant(value=0.0)),
        is_bias=True)
# See the License for the specific language governing permissions and
# limitations under the License.

import math

import paddle
from paddle import nn

from .. import PretrainedModel, register_base_model

__all__ = [
    'SqueezeBertModel',
    'SqueezeBertForSequenceClassification',
    'SqueezeBertForTokenClassification',
    'SqueezeBertForQuestionAnswering',
]

ACT2FN = {'gelu': nn.GELU()}


def _convert_attention_mask(attention_mask, inputs):
    if attention_mask.dim() == 3:
        extended_attention_mask = attention_mask.unsqueeze(1)
    elif attention_mask.dim() == 2:
        # extended_attention_mask = attention_mask[:, None, None, :]
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(1)
    extended_attention_mask = paddle.cast(extended_attention_mask,
                                          inputs.dtype)  # fp16 compatibility
    extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
    return extended_attention_mask


class SqueezeBertEmbeddings(nn.Layer):
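# Usage sketch for _convert_attention_mask above (tensor names are illustrative only):
# a 2-D padding mask (1 = keep, 0 = pad) is broadcast to (batch, 1, 1, seq_len) and
# turned into additive attention biases, so padded positions receive -10000.0 and are
# suppressed after the softmax.
#
#   mask = paddle.to_tensor([[1, 1, 1, 0]], dtype='float32')   # (batch=1, seq_len=4)
#   hidden = paddle.zeros([1, 4, 8])                            # dummy activations
#   ext = _convert_attention_mask(mask, hidden)                 # shape [1, 1, 1, 4]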