def __init__(self, num_feat=1024, num_group=16, dropout=0, forward_expansion=4,
             additional_output=False, **kwargs):
    super(EncoderLayer, self).__init__(**kwargs)
    self.num_feat = num_feat
    self.num_group = num_group
    self.dropout = dropout
    self.forward_expansion = forward_expansion
    weight_initializer = mx.init.Normal(0.01)
    self.attention = MultiHeadAttention(num_feat=num_feat, num_group=num_group,
                                        additional_output=additional_output)
    self.norm1 = nn.LayerNorm()
    self.norm2 = nn.LayerNorm()
    self.dropout_layer = nn.Dropout(self.dropout)
    self.feed_forward = nn.Sequential()
    self.feed_forward.add(
        nn.Dense(forward_expansion * num_feat, weight_initializer=weight_initializer))
    self.feed_forward.add(nn.Activation('relu'))
    self.feed_forward.add(
        nn.Dense(num_feat, weight_initializer=weight_initializer))

def __init__(self, units, vocab_size, max_length, num_layers, num_heads,
             dropout=0.0, prefix=None, params=None):
    super(GPT2Model, self).__init__(prefix=prefix, params=params)
    self._units = units
    self._max_length = max_length
    self._num_layers = num_layers
    self._num_heads = num_heads
    with self.name_scope():
        self._pos_embed = nn.Embedding(input_dim=max_length, output_dim=units,
                                       weight_initializer=mx.init.Normal(0.01),
                                       prefix='pos_embed_')
        self._embed = nn.Embedding(input_dim=vocab_size, output_dim=units,
                                   prefix='embed_',
                                   weight_initializer=mx.init.Normal(0.02))
        self._drop = nn.Dropout(dropout)
        self._logits_proj = nn.Dense(units=vocab_size, in_units=units,
                                     use_bias=False, flatten=False,
                                     params=self._embed.params)
        self._self_attention_layers = nn.HybridSequential()
        self._ffn_layers = nn.HybridSequential()
        self._attn_ln = nn.HybridSequential()
        self._ffn_ln = nn.HybridSequential()
        for i in range(num_layers):
            self._self_attention_layers.add(GPT2SelfAttentionLayer(
                units=units, num_heads=num_heads, dropout=dropout,
                prefix='self_attn{}_'.format(i)))
            self._ffn_layers.add(GPT2FFNLayer(
                units=units, hidden_size=units * 4, dropout=dropout,
                prefix='ffn{}_'.format(i)))
            self._attn_ln.add(nn.LayerNorm(prefix='attn_ln{}_'.format(i)))
            self._ffn_ln.add(nn.LayerNorm(prefix='ffn_ln{}_'.format(i)))
        self._final_ln = nn.LayerNorm(prefix='final_ln{}_'.format(i))

def __init__(
    self,
    context_length: int,
    prediction_length: int,
    d_hidden: int,
    d_var: int,
    n_head: int,
    dropout: float = 0.0,
    **kwargs,
):
    super(TemporalFusionDecoder, self).__init__(**kwargs)
    self.context_length = context_length
    self.prediction_length = prediction_length

    with self.name_scope():
        self.enrich = GatedResidualNetwork(
            d_hidden=d_hidden,
            d_static=d_var,
            dropout=dropout,
        )
        self.attention = SelfAttention(
            context_length=context_length,
            prediction_length=prediction_length,
            d_hidden=d_hidden,
            n_head=n_head,
            share_values=True,
            dropout=dropout,
        )
        self.att_net = nn.HybridSequential(prefix="attention_")
        self.att_net.add(nn.Dropout(dropout))
        self.att_net.add(
            nn.Dense(
                units=d_hidden * 2,
                flatten=False,
                weight_initializer=init.Xavier(),
            ))
        self.att_net.add(GatedLinearUnit(axis=-1, nonlinear=False))
        self.att_lnorm = nn.LayerNorm(axis=-1)
        self.ff_net = nn.HybridSequential()
        self.ff_net.add(GatedResidualNetwork(d_hidden, dropout=dropout))
        self.ff_net.add(
            nn.Dense(
                units=d_hidden * 2,
                flatten=False,
                weight_initializer=init.Xavier(),
            ))
        self.ff_net.add(GatedLinearUnit(axis=-1, nonlinear=False))
        self.ff_lnorm = nn.LayerNorm(axis=-1)

def __init__(self, attention_cell='multi_head', units=128, hidden_size=512,
             num_heads=4, scaled=True, dropout=0.0, use_residual=True,
             output_attention=False, weight_initializer=None,
             bias_initializer='zeros', prefix=None, params=None):
    super(TransformerDecoderCell, self).__init__(prefix=prefix, params=params)
    self._units = units
    self._num_heads = num_heads
    self._dropout = dropout
    self._use_residual = use_residual
    self._output_attention = output_attention
    self._scaled = scaled
    with self.name_scope():
        if dropout:
            self.dropout_layer = nn.Dropout(rate=dropout)
        self.attention_cell_in = _get_attention_cell(attention_cell,
                                                     units=units,
                                                     num_heads=num_heads,
                                                     scaled=scaled,
                                                     dropout=dropout)
        self.attention_cell_inter = _get_attention_cell(attention_cell,
                                                        units=units,
                                                        num_heads=num_heads,
                                                        scaled=scaled,
                                                        dropout=dropout)
        self.proj_in = nn.Dense(units=units, flatten=False, use_bias=False,
                                weight_initializer=weight_initializer,
                                bias_initializer=bias_initializer,
                                prefix='proj_in_')
        self.proj_inter = nn.Dense(units=units, flatten=False, use_bias=False,
                                   weight_initializer=weight_initializer,
                                   bias_initializer=bias_initializer,
                                   prefix='proj_inter_')
        self.ffn = PositionwiseFFN(hidden_size=hidden_size, units=units,
                                   use_residual=use_residual, dropout=dropout,
                                   weight_initializer=weight_initializer,
                                   bias_initializer=bias_initializer)
        self.layer_norm_in = nn.LayerNorm()
        self.layer_norm_inter = nn.LayerNorm()

def __init__(self, model_dim, head_num, dropout, att_dropout, **kwargs):
    super(MultiHeadAttention, self).__init__(**kwargs)
    self._model_dim = model_dim
    self._head_num = head_num
    if self._model_dim % self._head_num != 0:
        raise ValueError(
            'In MultiHeadAttention, model_dim must be divisible by head_num. '
            'Received model_dim={}, head_num={}'.format(model_dim, head_num))
    with self.name_scope():
        self.queries_dense = nn.Dense(model_dim, use_bias=False, flatten=False,
                                      prefix="query_")
        self.keys_dense = nn.Dense(model_dim, use_bias=False, flatten=False,
                                   prefix="keys_")
        self.values_dense = nn.Dense(model_dim, use_bias=False, flatten=False,
                                     prefix="values_")
        self.att_dropout = nn.Dropout(att_dropout)
        self.dropout = nn.Dropout(dropout)
        self.LayerNorm = nn.LayerNorm()

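# Hedged usage sketch for the block above: only the projection layers defined in
# __init__ are exercised, since the class's forward pass is not shown here. Assumes
# mxnet is imported as mx and MultiHeadAttention is in scope.
attn = MultiHeadAttention(model_dim=64, head_num=8, dropout=0.1, att_dropout=0.1)
attn.initialize()
x = mx.nd.ones((2, 5, 64))    # (batch, seq_len, model_dim)
q = attn.queries_dense(x)     # flatten=False keeps the 3-D shape: (2, 5, 64)
k = attn.keys_dense(x)
v = attn.values_dense(x)
print(q.shape, k.shape, v.shape)
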
def __init__(self, backbone, units=768, layer_norm_eps=1E-12, dropout_prob=0.1,
             activation='tanh', weight_initializer=None, bias_initializer=None,
             use_segmentation=True):
    super().__init__()
    self.backbone = backbone
    self.use_segmentation = use_segmentation
    self.start_scores = nn.Dense(1, flatten=False,
                                 weight_initializer=weight_initializer,
                                 bias_initializer=bias_initializer)
    self.end_scores = nn.HybridSequential()
    self.end_scores.add(nn.Dense(units, flatten=False,
                                 weight_initializer=weight_initializer,
                                 bias_initializer=bias_initializer))
    self.end_scores.add(get_activation(activation))
    self.end_scores.add(nn.LayerNorm(epsilon=layer_norm_eps))
    self.end_scores.add(nn.Dense(1, flatten=False,
                                 weight_initializer=weight_initializer,
                                 bias_initializer=bias_initializer))
    self.answerable_scores = nn.HybridSequential()
    self.answerable_scores.add(nn.Dense(units, flatten=False,
                                        weight_initializer=weight_initializer,
                                        bias_initializer=bias_initializer))
    self.answerable_scores.add(get_activation(activation))
    self.answerable_scores.add(nn.Dropout(dropout_prob))
    self.answerable_scores.add(nn.Dense(2, flatten=False,
                                        weight_initializer=weight_initializer,
                                        bias_initializer=bias_initializer))

def __init__(self, *, units=512, hidden_size=2048, dropout=0.0, use_residual=True,
             ffn1_dropout=False, activation='relu', layer_norm_eps=1e-5,
             weight_initializer=None, bias_initializer='zeros',
             prefix=None, params=None):
    super().__init__(prefix=prefix, params=params)
    self._use_residual = use_residual
    self._dropout = dropout
    self._ffn1_dropout = ffn1_dropout
    with self.name_scope():
        self.ffn_1 = nn.Dense(units=hidden_size, flatten=False,
                              weight_initializer=weight_initializer,
                              bias_initializer=bias_initializer,
                              prefix='ffn_1_')
        self.activation = self._get_activation(activation) if activation else None
        self.ffn_2 = nn.Dense(units=units, flatten=False,
                              weight_initializer=weight_initializer,
                              bias_initializer=bias_initializer,
                              prefix='ffn_2_')
        if dropout:
            self.dropout_layer = nn.Dropout(rate=dropout)
        self.layer_norm = nn.LayerNorm(in_channels=units, epsilon=layer_norm_eps)

def __init__(self, params, train, **kwargs):
    super(DecoderStack, self).__init__(**kwargs)
    self.param = params
    with self.name_scope():
        self.layer = nn.Sequential()
        with self.layer.name_scope():
            for i in range(params.num_hidden_layers):
                self_attention_layer = attention_layer.SelfAttention(
                    params.hidden_size, params.num_heads,
                    params.attention_dropout, train)
                enc_dec_attention_layer = attention_layer.Attention(
                    params.hidden_size, params.num_heads,
                    params.attention_dropout, train)
                feed_forward_network = fnn_layer.FeedForwardNetwork(
                    params.hidden_size, params.filter_size,
                    params.relu_dropout, train)
                self.layer.add(
                    PrePostProcessingWrapper(self_attention_layer, params, train),
                    PrePostProcessingWrapper(enc_dec_attention_layer, params, train),
                    PrePostProcessingWrapper(feed_forward_network, params, train))
        self.output_normalization = nn.LayerNorm(axis=-1, epsilon=1e-6)

def __init__(self, backbone, **kwargs):
    '''
    Parameters
    ----------
    backbone: dict, should have 6 keys:
        "K", "num_of_chev_filters", "num_of_time_filters",
        "time_conv_kernel_size", "time_conv_strides", "cheb_polynomials"
    '''
    super(ASTGCN_block, self).__init__(**kwargs)

    K = backbone['K']
    num_of_chev_filters = backbone['num_of_chev_filters']
    num_of_time_filters = backbone['num_of_time_filters']
    time_conv_strides = backbone['time_conv_strides']

    with self.name_scope():
        self.SAt = Spatial_Attention_layer()
        self.cheb_conv_SAt = cheb_conv_with_SAt(
            num_of_filters=num_of_chev_filters, K=K)
        self.TAt = Temporal_Attention_layer()
        self.time_conv = nn.Conv2D(
            channels=num_of_time_filters,
            kernel_size=(1, 3),
            padding=(0, 1),
            strides=(1, time_conv_strides))
        self.residual_conv = nn.Conv2D(
            channels=num_of_time_filters,
            kernel_size=(1, 1),
            strides=(1, time_conv_strides))
        self.ln = nn.LayerNorm(axis=2)

def __init__(self, units: int = 768, hidden_size: int = 3072,
             layer_norm_eps: float = 1E-5, hidden_dropout_prob: float = 0.1,
             weight_initializer=None, bias_initializer='zeros',
             activation='gelu(tanh)', dtype='float32'):
    super().__init__()
    self._units = units
    self._hidden_size = hidden_size
    self._layer_norm_eps = layer_norm_eps
    self._hidden_dropout_prob = hidden_dropout_prob
    self._weight_initializer = weight_initializer
    self._bias_initializer = bias_initializer
    self._activation = activation
    self._dtype = dtype
    self.layer_norm = nn.LayerNorm(epsilon=self._layer_norm_eps,
                                   in_channels=self._units)
    self.ffn_1 = nn.Dense(units=self._hidden_size,
                          in_units=self._units,
                          flatten=False,
                          weight_initializer=self._weight_initializer,
                          bias_initializer=self._bias_initializer,
                          dtype=self._dtype)
    self.activation = get_activation(self._activation)
    self.ffn_2 = nn.Dense(units=self._units,
                          in_units=self._hidden_size,
                          flatten=False,
                          weight_initializer=self._weight_initializer,
                          bias_initializer=self._bias_initializer,
                          dtype=self._dtype)
    self.hidden_dropout = nn.Dropout(self._hidden_dropout_prob)

def test_layernorm(dshape):
    layer = nn.LayerNorm(in_channels=10)
    print("checking layer {}\nshape: {}.".format(layer, dshape))
    layer.initialize()
    x = mx.np.ones(shape=dshape)
    x.attach_grad()
    with mx.autograd.record():
        out = layer(x)
    out.backward()
    np_out = out.asnumpy()
    np_dx = x.grad.asnumpy()

    layer.hybridize()
    x = mx.np.ones(shape=dshape)
    x.attach_grad()
    with mx.autograd.record():
        out = layer(x)
    out.backward()

    mx.test_utils.assert_almost_equal(np_out, out.asnumpy(), rtol=1e-5, atol=1e-6)
    mx.test_utils.assert_almost_equal(np_dx, x.grad.asnumpy(), rtol=1e-5, atol=1e-6)

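# Minimal sketch of invoking the test above directly (assumes mx and nn are imported
# as in the test and NumPy-array semantics are enabled, e.g. via mx.npx.set_np()).
# Each shape must end in 10 to match LayerNorm(in_channels=10).
mx.npx.set_np()
for dshape in [(2, 10), (2, 5, 10)]:
    test_layernorm(dshape)
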
def __init__(
    self,
    context_length: int,
    prediction_length: int,
    d_input: int,
    d_hidden: int,
    **kwargs,
) -> None:
    super(TemporalFusionEncoder, self).__init__(**kwargs)
    self.context_length = context_length
    self.prediction_length = prediction_length

    with self.name_scope():
        self.encoder_lstm = rnn.HybridSequentialRNNCell(prefix="encoder_")
        self.encoder_lstm.add(
            rnn.LSTMCell(hidden_size=d_hidden, input_size=d_input))
        self.decoder_lstm = rnn.HybridSequentialRNNCell(prefix="decoder_")
        self.decoder_lstm.add(
            rnn.LSTMCell(hidden_size=d_hidden, input_size=d_input))
        self.gate = nn.HybridSequential()
        self.gate.add(nn.Dense(d_hidden * 2, flatten=False))
        self.gate.add(GatedLinearUnit(axis=-1, nonlinear=False))
        if d_input != d_hidden:
            self.skip_proj = nn.Dense(d_hidden, flatten=False)
            self.add_skip = True
        else:
            self.add_skip = False
        self.lnorm = nn.LayerNorm(axis=-1)

def __init__(self, backbone_cfg, weight_initializer=None, bias_initializer=None): """ Parameters ---------- backbone_cfg weight_initializer bias_initializer """ super().__init__() self.backbone_model = AlbertModel.from_cfg(backbone_cfg) if weight_initializer is None: weight_initializer = self.backbone_model.weight_initializer if bias_initializer is None: bias_initializer = self.backbone_model.bias_initializer self.mlm_decoder = nn.HybridSequential() # Extra non-linear layer self.mlm_decoder.add(nn.Dense(units=self.backbone_model.embed_size, in_units=self.backbone_model.units, flatten=False, weight_initializer=weight_initializer, bias_initializer=bias_initializer)) self.mlm_decoder.add(get_activation(self.backbone_model.activation)) self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps, in_channels=self.backbone_model.embed_size)) # only load the dense weights with a re-initialized bias # parameters are stored in 'word_embed_bias' which is # not used in original embedding self.mlm_decoder.add(nn.Dense(units=self.backbone_model.vocab_size, in_units=self.backbone_model.embed_size, flatten=False, bias_initializer=bias_initializer)) self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight
def __init__(self, *, attention_cell='multi_head', num_layers=2, units=512,
             hidden_size=2048, max_length=50, num_heads=4, scaled=True,
             scale_embed=True, norm_inputs=True, dropout=0.0, use_residual=True,
             output_attention=False, output_all_encodings=False,
             weight_initializer=None, bias_initializer='zeros',
             prefix=None, params=None):
    super().__init__(prefix=prefix, params=params)
    assert units % num_heads == 0, \
        'In TransformerEncoder, the units should be divided exactly ' \
        'by the number of heads. Received units={}, num_heads={}' \
        .format(units, num_heads)
    self._max_length = max_length
    self._units = units
    self._output_attention = output_attention
    self._output_all_encodings = output_all_encodings
    self._dropout = dropout
    self._scale_embed = scale_embed
    self._norm_inputs = norm_inputs
    with self.name_scope():
        if dropout:
            self.dropout_layer = nn.Dropout(rate=dropout)
        if self._norm_inputs:
            self.layer_norm = nn.LayerNorm(in_channels=units, epsilon=1e-5)
        self.position_weight = self.params.get_constant(
            'const', _position_encoding_init(max_length, units))
        self.transformer_cells = nn.HybridSequential()
        for i in range(num_layers):
            cell = TransformerEncoderCell(
                units=units, hidden_size=hidden_size, num_heads=num_heads,
                attention_cell=attention_cell,
                weight_initializer=weight_initializer,
                bias_initializer=bias_initializer, dropout=dropout,
                use_residual=use_residual, scaled=scaled,
                output_attention=output_attention,
                prefix='transformer%d_' % i)
            self.transformer_cells.add(cell)

def __init__(self, units=512, hidden_size=2048, dropout=0.0, use_residual=True,
             weight_initializer=None, bias_initializer='zeros',
             activation='relu', prefix=None, params=None):
    super(PositionwiseFFN, self).__init__(prefix=prefix, params=params)
    self._hidden_size = hidden_size
    self._units = units
    self._use_residual = use_residual
    with self.name_scope():
        self.ffn_1 = nn.Dense(units=hidden_size, flatten=False,
                              activation=activation,
                              weight_initializer=weight_initializer,
                              bias_initializer=bias_initializer,
                              prefix='ffn_1_')
        self.ffn_2 = nn.Dense(units=units, flatten=False,
                              weight_initializer=weight_initializer,
                              bias_initializer=bias_initializer,
                              prefix='ffn_2_')
        self.dropout_layer = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm()

def __init__(self, backbone, **kwargs):
    super(ST_block, self).__init__(**kwargs)

    # number of the first temporal convolution's filters
    num_of_time_conv_filters1 = backbone['num_of_time_conv_filters1']
    # number of the second temporal convolution's filters
    num_of_time_conv_filters2 = backbone['num_of_time_conv_filters2']
    # length of the temporal convolutional filter
    K_t = backbone['K_t']
    # number of the spatial convolution's filters
    num_of_cheb_filters = backbone['num_of_cheb_filters']
    # order of the ChebNet approximation
    K = backbone['K']
    # list of Chebyshev polynomials from first order up to K-th order
    cheb_polys = backbone['cheb_polys']

    with self.name_scope():
        self.time_conv1 = temporal_conv_layer(num_of_time_conv_filters1, K_t)
        self.cheb_conv = cheb_conv(num_of_cheb_filters, K, cheb_polys)
        self.time_conv2 = temporal_conv_layer(num_of_time_conv_filters2, K_t)
        self.ln = nn.LayerNorm(axis=1)

def __init__(self, vocab_size, num_layer=6, model_dim=512, ff_dim=2048,
             h=8, dropout=0.1):
    super().__init__()
    self.num_layer = num_layer
    self.model_dim = model_dim
    self.dropout = dropout
    self.h = h
    with self.name_scope():
        self.decoder_layers = [
            DecoderLayer(model_dim=model_dim, ff_dim=ff_dim, h=h, dropout=dropout)
            for _ in range(num_layer)
        ]
        register_children(self, self.decoder_layers)
        self.norm = nn.LayerNorm()
        self.positional_embedding = PositionalEmbedding(vocab_size, model_dim,
                                                        dropout=dropout)

def __init__(
    self,
    d_model: int,
    d_hidden: int,
    activation: str = "softrelu",
    pre_ln: bool = True,
    dropout: float = 0.0,
    **kwargs,
):
    super(PosFFN, self).__init__(**kwargs)
    self.pre_ln = pre_ln
    with self.name_scope():
        self.linear1 = nn.Dense(
            units=d_hidden,
            use_bias=True,
            flatten=False,
            activation=activation,
            weight_initializer=init.Xavier(),
        )
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Dense(
            units=d_model,
            use_bias=True,
            flatten=False,
            weight_initializer=init.Xavier(),
        )
        self.lnorm = nn.LayerNorm(axis=-1)

def __init__(self, backbone, **kwargs):
    '''
    Parameters
    ----------
    backbone: dict, should have 5 keys:
        "K", "num_of_chev_filters", "num_of_time_filters",
        "time_conv_strides", "cheb_polynomials"
    '''
    super(MSTGCN_block, self).__init__(**kwargs)

    K = backbone['K']
    num_of_chev_filters = backbone['num_of_chev_filters']
    num_of_time_filters = backbone['num_of_time_filters']
    time_conv_strides = backbone['time_conv_strides']
    cheb_polynomials = backbone["cheb_polynomials"]

    with self.name_scope():
        self.cheb_conv = cheb_conv(num_of_filters=num_of_chev_filters,
                                   K=K,
                                   cheb_polynomials=cheb_polynomials)
        self.time_conv = nn.Conv2D(channels=num_of_time_filters,
                                   kernel_size=(1, 3),
                                   padding=(0, 1),
                                   strides=(1, time_conv_strides))
        self.residual_conv = nn.Conv2D(channels=num_of_time_filters,
                                       kernel_size=(1, 1),
                                       strides=(1, time_conv_strides))
        self.ln = nn.LayerNorm(axis=2)

def __init__(self, attention_cell=None, units=128, hidden_size=512, num_heads=4,
             scaled=True, dropout=0.0, attention_dropout=0.0, use_residual=True,
             output_attention=False, weight_initializer=None,
             bias_initializer='zeros', prefix=None, params=None):
    super().__init__(prefix=prefix, params=params)
    assert attention_cell is None
    self._units = units
    self._num_heads = num_heads
    self._dropout = dropout
    self._use_residual = use_residual
    self._output_attention = output_attention
    self._scaled = scaled
    with self.name_scope():
        if dropout:
            self.dropout_layer = nn.Dropout(rate=dropout)
        assert units % num_heads == 0
        self.attention_cell = PositionalEmbeddingMultiHeadAttentionCell(
            d_head=units // num_heads, num_heads=num_heads, scaled=scaled,
            dropout=attention_dropout)
        self.proj = nn.Dense(units=units, flatten=False, use_bias=False,
                             weight_initializer=weight_initializer,
                             bias_initializer=bias_initializer, prefix='proj_')
        self.ffn = nlp.model.PositionwiseFFN(hidden_size=hidden_size, units=units,
                                             use_residual=use_residual,
                                             dropout=dropout, ffn1_dropout=True,
                                             activation='relu',
                                             weight_initializer=weight_initializer,
                                             bias_initializer=bias_initializer,
                                             layer_norm_eps=1e-12)
        self.layer_norm = nn.LayerNorm(in_channels=units, epsilon=1e-12)

def __init__(
    self,
    d_hidden: int,
    d_input: Optional[int] = None,
    d_output: Optional[int] = None,
    d_static: Optional[int] = None,
    dropout: float = 0.0,
    **kwargs,
):
    super(GatedResidualNetwork, self).__init__(**kwargs)
    self.d_hidden = d_hidden
    self.d_input = d_input or d_hidden
    self.d_static = d_static or 0
    if d_output is None:
        self.d_output = self.d_input
        self.add_skip = False
    else:
        self.d_output = d_output
        if d_output != self.d_input:
            self.add_skip = True
            with self.name_scope():
                self.skip_proj = nn.Dense(
                    units=self.d_output,
                    in_units=self.d_input,
                    flatten=False,
                    weight_initializer=init.Xavier(),
                )
        else:
            self.add_skip = False

    with self.name_scope():
        self.mlp = nn.HybridSequential(prefix="mlp_")
        self.mlp.add(
            nn.Dense(
                units=self.d_hidden,
                in_units=self.d_input + self.d_static,
                flatten=False,
                weight_initializer=init.Xavier(),
            ))
        self.mlp.add(nn.ELU())
        self.mlp.add(
            nn.Dense(
                units=self.d_hidden,
                in_units=self.d_hidden,
                flatten=False,
                weight_initializer=init.Xavier(),
            ))
        self.mlp.add(nn.Dropout(dropout))
        self.mlp.add(
            nn.Dense(
                units=self.d_output * 2,
                in_units=self.d_hidden,
                flatten=False,
                weight_initializer=init.Xavier(),
            ))
        self.mlp.add(GatedLinearUnit(axis=-1, nonlinear=False))
        self.lnorm = nn.LayerNorm(axis=-1, in_channels=self.d_output)

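# Hedged sketch of running the GRN sub-layers defined above (the class's own
# hybrid_forward is not shown). Assumes the GatedLinearUnit used above halves its
# last axis, so the mlp maps d_input -> d_output; the residual + LayerNorm wiring
# below is an assumption based on the usual gated-residual pattern.
grn = GatedResidualNetwork(d_hidden=8, dropout=0.1)
grn.initialize()
x = mx.nd.ones((2, 4, 8))    # (batch, time, d_input), no static input
h = grn.mlp(x)               # Dense/ELU/Dense/Dropout/Dense -> GLU: (2, 4, 8)
out = grn.lnorm(x + h)
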
def __init__(self, units=768, is_eval=False, prefix=None, params=None):
    super(PoolerEndLogits, self).__init__(prefix=prefix, params=params)
    self._eval = is_eval
    self._hsz = units
    with self.name_scope():
        self.dense_0 = nn.Dense(units, activation='tanh', flatten=False)
        self.dense_1 = nn.Dense(1, flatten=False)
        self.layernorm = nn.LayerNorm(epsilon=1e-12, in_channels=units)

def __init__(self, units, heads, hidden_size, qkv_bias=False, att_drop=0.,
             drop=0., activation='gelu', layer_norm_eps=1e-12):
    super(_TransformerEncoder, self).__init__()
    with self.name_scope():
        self.norm1 = nn.LayerNorm(epsilon=layer_norm_eps, in_channels=units)
        self.att = _MultiHeadAttention(units, heads, qkv_bias, att_drop, drop)
        self.norm2 = nn.LayerNorm(epsilon=layer_norm_eps)
        self.mlp = _MLP(units, hidden_size, activation, drop)

def __init__(self, c_in, T, num_of_vertices, activation='GLU', **kwargs):
    super(Output_layer, self).__init__(**kwargs)
    self.c_in = c_in
    self.layer = nn.HybridSequential()
    self.layer.add(Temporal_conv_layer(T, c_in, c_in, activation),
                   nn.LayerNorm(axis=1),
                   Temporal_conv_layer(1, c_in, c_in, 'sigmoid'),
                   nn.Conv2D(1, (1, 1), activation=None))

def __init__(self, hidden_size, output_size, dropout=0.0, **kwargs):
    super(FeedForward, self).__init__(**kwargs)
    with self.name_scope():
        self.dense1 = nn.Dense(hidden_size, activation='relu', flatten=False)
        self.dense2 = nn.Dense(output_size, flatten=False)
        self.layer_norm = nn.LayerNorm()
        self.dropout = nn.Dropout(dropout)

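# Hedged usage sketch: wire the sub-layers defined above in the common
# "FFN -> dropout -> residual -> LayerNorm" pattern. Whether the real class applies
# the residual this way is an assumption; only attributes created in __init__ are used.
ffn = FeedForward(hidden_size=2048, output_size=512, dropout=0.1)
ffn.initialize()
x = mx.nd.ones((2, 10, 512))              # (batch, seq_len, output_size)
h = ffn.dense2(ffn.dense1(x))             # 512 -> 2048 -> 512, flatten=False keeps 3-D
out = ffn.layer_norm(x + ffn.dropout(h))
print(out.shape)                          # (2, 10, 512)
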
def __init__(self, layer_num=6, **kwargs):
    super(SimpleNet, self).__init__(**kwargs)
    self._layer_num = layer_num
    self.ln_l = nn.HybridSequential()
    self.dense_l = nn.HybridSequential()
    for i in range(layer_num):
        self.dense_l.add(
            nn.Dense(units=32 + layer_num - 1 - i, flatten=False))
        self.ln_l.add(nn.LayerNorm())

def __init__(self, layer, params, train, **kwargs):
    super(PrePostProcessingWrapper, self).__init__(**kwargs)
    self.postprocess_dropout = params.layer_postprocess_dropout
    self.train = train
    with self.name_scope():
        self.layer = layer
        self.layer_norm = nn.LayerNorm(epsilon=1e-6)
        # nn.Dropout takes the drop rate directly (not a keep probability)
        self.dropout = nn.Dropout(self.postprocess_dropout)

def __init__(self, backbone_cfg, weight_initializer=None, bias_initializer=None): """ Parameters ---------- backbone_cfg The cfg of the backbone model weight_initializer bias_initializer """ super().__init__() self.backbone_model = MobileBertModel.from_cfg(backbone_cfg) if weight_initializer is None: weight_initializer = self.backbone_model.weight_initializer if bias_initializer is None: bias_initializer = self.backbone_model.bias_initializer # Construct nsp_classifier for next sentence prediction self.nsp_classifier = nn.Dense(units=2, in_units=self.backbone_model.units, weight_initializer=weight_initializer, dtype=self.backbone_model.dtype) self.mlm_decoder = nn.HybridSequential() # Extra non-linear layer self.mlm_decoder.add( nn.Dense(units=self.backbone_model.units, in_units=self.backbone_model.units, flatten=False, weight_initializer=weight_initializer, bias_initializer=bias_initializer, dtype=self.backbone_model.dtype)) self.mlm_decoder.add(get_activation(self.backbone_model.activation)) # use basic layer normalization for pretaining self.mlm_decoder.add( nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps, in_channels=self.backbone_model.units)) # only load the dense weights with a re-initialized bias # parameters are stored in 'word_embed_bias' which is # not used in original embedding self.embedding_table = nn.Dense( units=self.backbone_model.vocab_size, in_units=self.backbone_model.embed_size, flatten=False, bias_initializer=bias_initializer, dtype=self.backbone_model.dtype) self.embedding_table.weight = self.backbone_model.word_embed.weight if self.backbone_model.embed_size != self.backbone_model.units: self.extra_table = nn.Dense(units=self.backbone_model.vocab_size, in_units=self.backbone_model.units - self.backbone_model.embed_size, flatten=False, use_bias=False, bias_initializer=bias_initializer, dtype=self.backbone_model.dtype)
def __init__(self, attention_cell='multi_head', num_layers=2, units=128,
             hidden_size=2048, max_length=50, num_heads=4, scaled=True,
             scale_embed=True, norm_inputs=True, dropout=0.0, use_residual=True,
             output_attention=False, weight_initializer=None,
             bias_initializer='zeros', prefix=None, params=None):
    super().__init__(prefix=prefix, params=params)
    assert units % num_heads == 0, \
        'In TransformerDecoder, the units should be divided exactly by the ' \
        'number of heads. Received units={}, num_heads={}'.format(units, num_heads)
    self._num_layers = num_layers
    self._units = units
    self._hidden_size = hidden_size
    self._num_states = num_heads
    self._max_length = max_length
    self._dropout = dropout
    self._use_residual = use_residual
    self._output_attention = output_attention
    self._scaled = scaled
    self._scale_embed = scale_embed
    self._norm_inputs = norm_inputs
    with self.name_scope():
        if dropout:
            self.dropout_layer = nn.Dropout(rate=dropout)
        if self._norm_inputs:
            self.layer_norm = nn.LayerNorm()
        encoding = _position_encoding_init(max_length, units)
        self.position_weight = self.params.get_constant(
            'const', encoding.astype(np.float32))
        self.transformer_cells = nn.HybridSequential()
        for i in range(num_layers):
            self.transformer_cells.add(
                TransformerDecoderCell(
                    units=units, hidden_size=hidden_size, num_heads=num_heads,
                    attention_cell=attention_cell,
                    weight_initializer=weight_initializer,
                    bias_initializer=bias_initializer, dropout=dropout,
                    scaled=scaled, use_residual=use_residual,
                    output_attention=output_attention,
                    prefix='transformer%d_' % i))

def __init__(self, model_dim, dropout=0.1):
    super(Resblock, self).__init__()
    self.model_dim = model_dim
    self.dropout = dropout
    self.resblock = nn.Sequential()
    with self.resblock.name_scope():
        self.resblock.add(nn.LayerNorm())
        self.resblock.add(nn.Dense(2 * self.model_dim, in_units=self.model_dim,
                                   activation="relu"))
        self.resblock.add(nn.Dropout(self.dropout))
        self.resblock.add(nn.Dense(self.model_dim, in_units=2 * self.model_dim))
        self.resblock.add(nn.Dropout(self.dropout))

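# Hedged usage sketch for Resblock: run the stacked sub-layers directly and add the
# residual connection the class name implies (an assumption; the real forward pass is
# not shown). Assumes mxnet is imported as mx and Resblock is in scope.
blk = Resblock(model_dim=64, dropout=0.1)
blk.initialize()
x = mx.nd.ones((2, 64))
y = blk.resblock(x)    # LayerNorm -> Dense(128, relu) -> Dropout -> Dense(64) -> Dropout
out = x + y
print(out.shape)       # (2, 64)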