Example #1
    def __init__(self,
                 num_feat=1024,
                 num_group=16,
                 dropout=0,
                 forward_expansion=4,
                 additional_output=False,
                 **kwargs):
        super(EncoderLayer, self).__init__(**kwargs)
        self.num_feat = num_feat
        self.num_group = num_group
        self.dropout = dropout
        self.forward_expansion = forward_expansion
        weight_initializer = mx.init.Normal(0.01)

        self.attention = MultiHeadAttention(
            num_feat=num_feat, num_group=num_group,
            additional_output=additional_output)
        self.norm1 = nn.LayerNorm()
        self.norm2 = nn.LayerNorm()
        self.dropout_layer = nn.Dropout(self.dropout)

        self.feed_forward = nn.Sequential()
        self.feed_forward.add(
            nn.Dense(forward_expansion * num_feat,
                     weight_initializer=weight_initializer))
        self.feed_forward.add(nn.Activation('relu'))
        self.feed_forward.add(
            nn.Dense(num_feat, weight_initializer=weight_initializer))
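
The snippet ends at the constructor, so the residual "add & norm" wiring is not shown. Below is a minimal, hypothetical sketch of that pattern restricted to the feed-forward path (the block name, shapes, and flatten=False are my assumptions, not taken from the source repo), written for MXNet 1.x Gluon.

import mxnet as mx
from mxnet.gluon import nn

class PostNormFFN(nn.HybridBlock):
    """Hypothetical post-norm residual block: LayerNorm(x + Dropout(FFN(x)))."""
    def __init__(self, num_feat=32, forward_expansion=4, dropout=0.1, **kwargs):
        super(PostNormFFN, self).__init__(**kwargs)
        self.ffn = nn.HybridSequential()
        self.ffn.add(nn.Dense(forward_expansion * num_feat, flatten=False))
        self.ffn.add(nn.Activation('relu'))
        self.ffn.add(nn.Dense(num_feat, flatten=False))
        self.dropout_layer = nn.Dropout(dropout)
        self.norm = nn.LayerNorm()

    def hybrid_forward(self, F, x):
        # residual connection followed by layer normalization ("add & norm")
        return self.norm(x + self.dropout_layer(self.ffn(x)))

block = PostNormFFN()
block.initialize()
print(block(mx.nd.ones((2, 5, 32))).shape)   # (2, 5, 32)
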
Example #2
 def __init__(self, units, vocab_size, max_length, num_layers, num_heads, dropout=0.0,
              prefix=None, params=None):
     super(GPT2Model, self).__init__(prefix=prefix, params=params)
     self._units = units
     self._max_length = max_length
     self._num_layers = num_layers
     self._num_heads = num_heads
     with self.name_scope():
         self._pos_embed = nn.Embedding(input_dim=max_length, output_dim=units,
                                        weight_initializer=mx.init.Normal(0.01),
                                        prefix='pos_embed_')
         self._embed = nn.Embedding(input_dim=vocab_size, output_dim=units, prefix='embed_',
                                    weight_initializer=mx.init.Normal(0.02))
         self._drop = nn.Dropout(dropout)
         self._logits_proj = nn.Dense(units=vocab_size, in_units=units, use_bias=False,
                                      flatten=False, params=self._embed.params)
         self._self_attention_layers = nn.HybridSequential()
         self._ffn_layers = nn.HybridSequential()
         self._attn_ln = nn.HybridSequential()
         self._ffn_ln = nn.HybridSequential()
         for i in range(num_layers):
             self._self_attention_layers.add(GPT2SelfAttentionLayer(
                 units=units, num_heads=num_heads, dropout=dropout,
                 prefix='self_attn{}_'.format(i)))
             self._ffn_layers.add(GPT2FFNLayer(
                 units=units, hidden_size=units * 4, dropout=dropout, prefix='ffn{}_'.format(i)))
             self._attn_ln.add(nn.LayerNorm(prefix='attn_ln{}_'.format(i)))
             self._ffn_ln.add(nn.LayerNorm(prefix='ffn_ln{}_'.format(i)))
         self._final_ln = nn.LayerNorm(prefix='final_ln_')
Example #3
    def __init__(
        self,
        context_length: int,
        prediction_length: int,
        d_hidden: int,
        d_var: int,
        n_head: int,
        dropout: float = 0.0,
        **kwargs,
    ):
        super(TemporalFusionDecoder, self).__init__(**kwargs)
        self.context_length = context_length
        self.prediction_length = prediction_length

        with self.name_scope():
            self.enrich = GatedResidualNetwork(
                d_hidden=d_hidden,
                d_static=d_var,
                dropout=dropout,
            )
            self.attention = SelfAttention(
                context_length=context_length,
                prediction_length=prediction_length,
                d_hidden=d_hidden,
                n_head=n_head,
                share_values=True,
                dropout=dropout,
            )
            self.att_net = nn.HybridSequential(prefix="attention_")
            self.att_net.add(nn.Dropout(dropout))
            self.att_net.add(
                nn.Dense(
                    units=d_hidden * 2,
                    flatten=False,
                    weight_initializer=init.Xavier(),
                ))
            self.att_net.add(GatedLinearUnit(
                axis=-1,
                nonlinear=False,
            ))
            self.att_lnorm = nn.LayerNorm(axis=-1)
            self.ff_net = nn.HybridSequential()
            self.ff_net.add(GatedResidualNetwork(
                d_hidden,
                dropout=dropout,
            ))
            self.ff_net.add(
                nn.Dense(
                    units=d_hidden * 2,
                    flatten=False,
                    weight_initializer=init.Xavier(),
                ))
            self.ff_net.add(GatedLinearUnit(
                axis=-1,
                nonlinear=False,
            ))
            self.ff_lnorm = nn.LayerNorm(axis=-1)
Example #4
    def __init__(self,
                 attention_cell='multi_head',
                 units=128,
                 hidden_size=512,
                 num_heads=4,
                 scaled=True,
                 dropout=0.0,
                 use_residual=True,
                 output_attention=False,
                 weight_initializer=None,
                 bias_initializer='zeros',
                 prefix=None,
                 params=None):
        super(TransformerDecoderCell, self).__init__(prefix=prefix,
                                                     params=params)
        self._units = units
        self._num_heads = num_heads
        self._dropout = dropout
        self._use_residual = use_residual
        self._output_attention = output_attention
        self._scaled = scaled
        with self.name_scope():
            if dropout:
                self.dropout_layer = nn.Dropout(rate=dropout)
            self.attention_cell_in = _get_attention_cell(attention_cell,
                                                         units=units,
                                                         num_heads=num_heads,
                                                         scaled=scaled,
                                                         dropout=dropout)
            self.attention_cell_inter = _get_attention_cell(
                attention_cell,
                units=units,
                num_heads=num_heads,
                scaled=scaled,
                dropout=dropout)
            self.proj_in = nn.Dense(units=units,
                                    flatten=False,
                                    use_bias=False,
                                    weight_initializer=weight_initializer,
                                    bias_initializer=bias_initializer,
                                    prefix='proj_in_')
            self.proj_inter = nn.Dense(units=units,
                                       flatten=False,
                                       use_bias=False,
                                       weight_initializer=weight_initializer,
                                       bias_initializer=bias_initializer,
                                       prefix='proj_inter_')
            self.ffn = PositionwiseFFN(hidden_size=hidden_size,
                                       units=units,
                                       use_residual=use_residual,
                                       dropout=dropout,
                                       weight_initializer=weight_initializer,
                                       bias_initializer=bias_initializer)

            self.layer_norm_in = nn.LayerNorm()
            self.layer_norm_inter = nn.LayerNorm()
Example #5
 def __init__(self, model_dim, head_num, dropout, att_dropout, **kwargs):
     super(MultiHeadAttention, self).__init__(**kwargs)
     self._model_dim = model_dim
     self._head_num = head_num
     if self._model_dim % self._head_num != 0:
         raise ValueError(
              'In MultiHeadAttention, model_dim must be divisible by '
              'head_num. Received model_dim={}, head_num={}'
             .format(model_dim, head_num))
     with self.name_scope():
         self.queries_dense = nn.Dense(model_dim,
                                       use_bias=False,
                                       flatten=False,
                                       prefix="query_")
         self.keys_dense = nn.Dense(model_dim,
                                    use_bias=False,
                                    flatten=False,
                                    prefix="keys_")
         self.values_dense = nn.Dense(model_dim,
                                      use_bias=False,
                                      flatten=False,
                                      prefix="values_")
         self.att_dropout = nn.Dropout(att_dropout)
         self.dropout = nn.Dropout(dropout)
         self.LayerNorm = nn.LayerNorm()
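
The constructor only builds the projections; the scaled dot-product step they feed is not shown. A standalone sketch of that step (shapes and the per-head layout are illustrative assumptions), using the classic mx.nd API:

import math
import mxnet as mx

# Illustrative sizes: 2 sequences of length 6, model_dim 16 split into 4 heads.
batch, length, model_dim, head_num = 2, 6, 16, 4
d_head = model_dim // head_num

q = mx.nd.random.normal(shape=(batch * head_num, length, d_head))
k = mx.nd.random.normal(shape=(batch * head_num, length, d_head))
v = mx.nd.random.normal(shape=(batch * head_num, length, d_head))

scores = mx.nd.batch_dot(q, k, transpose_b=True) / math.sqrt(d_head)   # (B*H, L, L)
weights = mx.nd.softmax(scores, axis=-1)    # attention weights per head
context = mx.nd.batch_dot(weights, v)       # (B*H, L, d_head)
print(context.shape)
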
Example #6
 def __init__(self, backbone, units=768, layer_norm_eps=1E-12, dropout_prob=0.1,
              activation='tanh', weight_initializer=None, bias_initializer=None,
              use_segmentation=True):
     super().__init__()
     self.backbone = backbone
     self.use_segmentation = use_segmentation
     self.start_scores = nn.Dense(1, flatten=False,
                                  weight_initializer=weight_initializer,
                                  bias_initializer=bias_initializer)
     self.end_scores = nn.HybridSequential()
     self.end_scores.add(nn.Dense(units, flatten=False,
                                  weight_initializer=weight_initializer,
                                  bias_initializer=bias_initializer))
     self.end_scores.add(get_activation(activation))
     self.end_scores.add(nn.LayerNorm(epsilon=layer_norm_eps))
     self.end_scores.add(nn.Dense(1, flatten=False,
                                  weight_initializer=weight_initializer,
                                  bias_initializer=bias_initializer))
     self.answerable_scores = nn.HybridSequential()
     self.answerable_scores.add(nn.Dense(units, flatten=False,
                                         weight_initializer=weight_initializer,
                                         bias_initializer=bias_initializer))
     self.answerable_scores.add(get_activation(activation))
     self.answerable_scores.add(nn.Dropout(dropout_prob))
     self.answerable_scores.add(nn.Dense(2, flatten=False,
                                         weight_initializer=weight_initializer,
                                         bias_initializer=bias_initializer))
Example #7
 def __init__(self,
              *,
              units=512,
              hidden_size=2048,
              dropout=0.0,
              use_residual=True,
              ffn1_dropout=False,
              activation='relu',
              layer_norm_eps=1e-5,
              weight_initializer=None,
              bias_initializer='zeros',
              prefix=None,
              params=None):
     super().__init__(prefix=prefix, params=params)
     self._use_residual = use_residual
     self._dropout = dropout
     self._ffn1_dropout = ffn1_dropout
     with self.name_scope():
         self.ffn_1 = nn.Dense(units=hidden_size,
                               flatten=False,
                               weight_initializer=weight_initializer,
                               bias_initializer=bias_initializer,
                               prefix='ffn_1_')
         self.activation = self._get_activation(
             activation) if activation else None
         self.ffn_2 = nn.Dense(units=units,
                               flatten=False,
                               weight_initializer=weight_initializer,
                               bias_initializer=bias_initializer,
                               prefix='ffn_2_')
         if dropout:
             self.dropout_layer = nn.Dropout(rate=dropout)
         self.layer_norm = nn.LayerNorm(in_channels=units,
                                        epsilon=layer_norm_eps)
Example #8
    def __init__(self, params, train, **kwargs):
        super(DecoderStack, self).__init__(**kwargs)
        self.param = params
        with self.name_scope():
            self.layer = nn.Sequential()
            with self.layer.name_scope():
                for i in range(params.num_hidden_layers):
                    self_attention_layer = attention_layer.SelfAttention(
                        params.hidden_size, params.num_heads,
                        params.attention_dropout, train)
                    enc_dec_attention_layer = attention_layer.Attention(
                        params.hidden_size, params.num_heads,
                        params.attention_dropout, train)
                    feed_forward_network = fnn_layer.FeedForwardNetwork(
                        params.hidden_size, params.filter_size,
                        params.relu_dropout, train)

                    self.layer.add(
                        PrePostProcessingWrapper(self_attention_layer, params,
                                                 train),
                        PrePostProcessingWrapper(enc_dec_attention_layer,
                                                 params, train),
                        PrePostProcessingWrapper(feed_forward_network, params,
                                                 train))
            self.output_normalization = nn.LayerNorm(axis=-1, epsilon=1e-6)
Example #9
    def __init__(self, backbone, **kwargs):
        '''
        Parameters
        ----------
        backbone: dict, should have 6 keys,
                        "K",
                        "num_of_chev_filters",
                        "num_of_time_filters",
                        "time_conv_kernel_size",
                        "time_conv_strides",
                        "cheb_polynomials"
        '''
        super(ASTGCN_block, self).__init__(**kwargs)

        K = backbone['K']
        num_of_chev_filters = backbone['num_of_chev_filters']
        num_of_time_filters = backbone['num_of_time_filters']
        time_conv_strides = backbone['time_conv_strides']

        with self.name_scope():
            self.SAt = Spatial_Attention_layer()
            self.cheb_conv_SAt = cheb_conv_with_SAt(
                num_of_filters=num_of_chev_filters,
                K=K)
            self.TAt = Temporal_Attention_layer()
            self.time_conv = nn.Conv2D(
                channels=num_of_time_filters,
                kernel_size=(1, 3),
                padding=(0, 1),
                strides=(1, time_conv_strides))
            self.residual_conv = nn.Conv2D(
                channels=num_of_time_filters,
                kernel_size=(1, 1),
                strides=(1, time_conv_strides))
            self.ln = nn.LayerNorm(axis=2)
Example #10
 def __init__(self,
              units: int = 768,
              hidden_size: int = 3072,
              layer_norm_eps: float = 1E-5,
              hidden_dropout_prob: float = 0.1,
              weight_initializer=None,
              bias_initializer='zeros',
              activation='gelu(tanh)',
              dtype='float32'):
     super().__init__()
     self._units = units
     self._hidden_size = hidden_size
     self._layer_norm_eps = layer_norm_eps
     self._hidden_dropout_prob = hidden_dropout_prob
     self._weight_initializer = weight_initializer
     self._bias_initializer = bias_initializer
     self._activation = activation
     self._dtype = dtype
     self.layer_norm = nn.LayerNorm(epsilon=self._layer_norm_eps,
                                    in_channels=self._units)
     self.ffn_1 = nn.Dense(units=self._hidden_size,
                           in_units=self._units,
                           flatten=False,
                           weight_initializer=self._weight_initializer,
                           bias_initializer=self._bias_initializer,
                           dtype=self._dtype)
     self.activation = get_activation(self._activation)
     self.ffn_2 = nn.Dense(units=self._units,
                           in_units=self._hidden_size,
                           flatten=False,
                           weight_initializer=self._weight_initializer,
                           bias_initializer=self._bias_initializer,
                           dtype=self._dtype)
     self.hidden_dropout = nn.Dropout(self._hidden_dropout_prob)
Example #11
def test_layernorm(dshape):
    layer = nn.LayerNorm(in_channels=10)
    print("checking layer {}\nshape: {}.".format(layer, dshape))
    layer.initialize()
    x = mx.np.ones(shape=dshape)
    x.attach_grad()
    with mx.autograd.record():
        out = layer(x)
    out.backward()

    np_out = out.asnumpy()
    np_dx = x.grad.asnumpy()

    layer.hybridize()

    x = mx.np.ones(shape=dshape)
    x.attach_grad()
    with mx.autograd.record():
        out = layer(x)
    out.backward()

    mx.test_utils.assert_almost_equal(np_out,
                                      out.asnumpy(),
                                      rtol=1e-5,
                                      atol=1e-6)
    mx.test_utils.assert_almost_equal(np_dx,
                                      x.grad.asnumpy(),
                                      rtol=1e-5,
                                      atol=1e-6)
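
A hypothetical direct invocation of the test above (it assumes an MXNet build where the NumPy array API is active, e.g. after mx.npx.set_np()); the trailing axis of dshape must be 10 to match in_channels=10:

test_layernorm((2, 5, 10))   # 3-D input; LayerNorm normalizes over the last axis
test_layernorm((4, 10))      # a 2-D shape with the same trailing dimension also works
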
Example #12
 def __init__(
     self,
     context_length: int,
     prediction_length: int,
     d_input: int,
     d_hidden: int,
     **kwargs,
 ) -> None:
     super(TemporalFusionEncoder, self).__init__(**kwargs)
     self.context_length = context_length
     self.prediction_length = prediction_length
     with self.name_scope():
         self.encoder_lstm = rnn.HybridSequentialRNNCell(prefix="encoder_")
         self.encoder_lstm.add(
             rnn.LSTMCell(
                 hidden_size=d_hidden,
                 input_size=d_input,
             ))
         self.decoder_lstm = rnn.HybridSequentialRNNCell(prefix="decoder_")
         self.decoder_lstm.add(
             rnn.LSTMCell(
                 hidden_size=d_hidden,
                 input_size=d_input,
             ))
         self.gate = nn.HybridSequential()
         self.gate.add(nn.Dense(d_hidden * 2, flatten=False))
         self.gate.add(GatedLinearUnit(axis=-1, nonlinear=False))
         if d_input != d_hidden:
             self.skip_proj = nn.Dense(d_hidden, flatten=False)
             self.add_skip = True
         else:
             self.add_skip = False
         self.lnorm = nn.LayerNorm(axis=-1)
Example #13
    def __init__(self, backbone_cfg,
                 weight_initializer=None,
                 bias_initializer=None):
        """

        Parameters
        ----------
        backbone_cfg
        weight_initializer
        bias_initializer
        """
        super().__init__()
        self.backbone_model = AlbertModel.from_cfg(backbone_cfg)
        if weight_initializer is None:
            weight_initializer = self.backbone_model.weight_initializer
        if bias_initializer is None:
            bias_initializer = self.backbone_model.bias_initializer
        self.mlm_decoder = nn.HybridSequential()
        # Extra non-linear layer
        self.mlm_decoder.add(nn.Dense(units=self.backbone_model.embed_size,
                                      in_units=self.backbone_model.units,
                                      flatten=False,
                                      weight_initializer=weight_initializer,
                                      bias_initializer=bias_initializer))
        self.mlm_decoder.add(get_activation(self.backbone_model.activation))
        self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps,
                                          in_channels=self.backbone_model.embed_size))
        # Share only the dense weight with the word embedding; the bias is
        # re-initialized and stored as 'word_embed_bias', which the original
        # embedding does not use.
        self.mlm_decoder.add(nn.Dense(units=self.backbone_model.vocab_size,
                                      in_units=self.backbone_model.embed_size,
                                      flatten=False,
                                      bias_initializer=bias_initializer))
        self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight
Example #14
    def __init__(self, *, attention_cell='multi_head', num_layers=2, units=512, hidden_size=2048,
                 max_length=50, num_heads=4, scaled=True, scale_embed=True, norm_inputs=True,
                 dropout=0.0, use_residual=True, output_attention=False, output_all_encodings=False,
                 weight_initializer=None, bias_initializer='zeros', prefix=None, params=None):
        super().__init__(prefix=prefix, params=params)
        assert units % num_heads == 0,\
            'In TransformerEncoder, The units should be divided exactly ' \
            'by the number of heads. Received units={}, num_heads={}' \
            .format(units, num_heads)
        self._max_length = max_length
        self._units = units
        self._output_attention = output_attention
        self._output_all_encodings = output_all_encodings
        self._dropout = dropout
        self._scale_embed = scale_embed
        self._norm_inputs = norm_inputs

        with self.name_scope():
            if dropout:
                self.dropout_layer = nn.Dropout(rate=dropout)
            if self._norm_inputs:
                self.layer_norm = nn.LayerNorm(in_channels=units, epsilon=1e-5)
            self.position_weight = self.params.get_constant(
                'const', _position_encoding_init(max_length, units))
            self.transformer_cells = nn.HybridSequential()
            for i in range(num_layers):
                cell = TransformerEncoderCell(
                    units=units, hidden_size=hidden_size, num_heads=num_heads,
                    attention_cell=attention_cell, weight_initializer=weight_initializer,
                    bias_initializer=bias_initializer, dropout=dropout, use_residual=use_residual,
                    scaled=scaled, output_attention=output_attention, prefix='transformer%d_' % i)
                self.transformer_cells.add(cell)
Example #15
 def __init__(self,
              units=512,
              hidden_size=2048,
              dropout=0.0,
              use_residual=True,
              weight_initializer=None,
              bias_initializer='zeros',
              activation='relu',
              prefix=None,
              params=None):
     super(PositionwiseFFN, self).__init__(prefix=prefix, params=params)
     self._hidden_size = hidden_size
     self._units = units
     self._use_residual = use_residual
     with self.name_scope():
         self.ffn_1 = nn.Dense(units=hidden_size,
                               flatten=False,
                               activation=activation,
                               weight_initializer=weight_initializer,
                               bias_initializer=bias_initializer,
                               prefix='ffn_1_')
         self.ffn_2 = nn.Dense(units=units,
                               flatten=False,
                               weight_initializer=weight_initializer,
                               bias_initializer=bias_initializer,
                               prefix='ffn_2_')
         self.dropout_layer = nn.Dropout(dropout)
         self.layer_norm = nn.LayerNorm()
Example #16
 def __init__(self, backbone, **kwargs):
     super(ST_block, self).__init__(**kwargs)
     
     # number of first temporal convolution's filters
     num_of_time_conv_filters1 = backbone['num_of_time_conv_filters1']
     
     # number of second temporal convolution's filters
     num_of_time_conv_filters2 = backbone['num_of_time_conv_filters2']
     
     # length of temporal convolutional filter
     K_t = backbone['K_t']
     
     # number of spatial convolution's filters
     num_of_cheb_filters = backbone['num_of_cheb_filters']
     
     # order of the Chebyshev polynomials (ChebNet)
     K = backbone['K']
     
     # list of chebyshev polynomials from first-order to K-order
     cheb_polys = backbone['cheb_polys']
     
     with self.name_scope():
         self.time_conv1 = temporal_conv_layer(num_of_time_conv_filters1, K_t)
         self.cheb_conv = cheb_conv(num_of_cheb_filters, K, cheb_polys)
         self.time_conv2 = temporal_conv_layer(num_of_time_conv_filters2, K_t)
         self.ln = nn.LayerNorm(axis=1)
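
Because axis=1 is unusual, here is a quick standalone check of what it means: normalization runs over the channel axis of an NCHW-style tensor instead of the default last axis (shapes are illustrative only).

import mxnet as mx
from mxnet.gluon import nn

ln = nn.LayerNorm(axis=1)
ln.initialize()
x = mx.nd.random.normal(loc=5.0, scale=2.0, shape=(2, 8, 4, 4))   # (N, C, H, W)
y = ln(x)
# after normalization, the mean over the channel axis is ~0 at every position
print(y.mean(axis=1).abs().max())
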
Example #17
    def __init__(self,
                 vocab_size,
                 num_layer=6,
                 model_dim=512,
                 ff_dim=2048,
                 h=8,
                 dropout=0.1):
        super().__init__()
        self.num_layer = num_layer
        self.model_dim = model_dim
        self.dropout = dropout
        self.h = h

        with self.name_scope():
            self.decoder_layers = [
                DecoderLayer(model_dim=model_dim,
                             ff_dim=ff_dim,
                             h=h,
                             dropout=dropout) for _ in range(num_layer)
            ]
            register_children(self, self.decoder_layers)
            self.norm = nn.LayerNorm()
            self.positional_embedding = PositionalEmbedding(vocab_size,
                                                            model_dim,
                                                            dropout=dropout)
Example #18
 def __init__(
     self,
     d_model: int,
     d_hidden: int,
     activation: str = "softrelu",
     pre_ln: bool = True,
     dropout: float = 0.0,
     **kwargs,
 ):
     super(PosFFN, self).__init__(**kwargs)
     self.pre_ln = pre_ln
     with self.name_scope():
         self.linear1 = nn.Dense(
             units=d_hidden,
             use_bias=True,
             flatten=False,
             activation=activation,
             weight_initializer=init.Xavier(),
         )
         self.dropout = nn.Dropout(dropout)
         self.linear2 = nn.Dense(
             units=d_model,
             use_bias=True,
             flatten=False,
             weight_initializer=init.Xavier(),
         )
         self.lnorm = nn.LayerNorm(axis=-1)
Example #19
    def __init__(self, backbone, **kwargs):
        '''
        Parameters
        ----------
        backbone: dict, should have 5 keys
                        "K",
                        "num_of_chev_filters",
                        "num_of_time_filters",
                        "time_conv_strides",
                        "cheb_polynomials"
        '''
        super(MSTGCN_block, self).__init__(**kwargs)

        K = backbone['K']
        num_of_chev_filters = backbone['num_of_chev_filters']
        num_of_time_filters = backbone['num_of_time_filters']
        time_conv_strides = backbone['time_conv_strides']
        cheb_polynomials = backbone["cheb_polynomials"]

        with self.name_scope():
            self.cheb_conv = cheb_conv(num_of_filters=num_of_chev_filters,
                                       K=K,
                                       cheb_polynomials=cheb_polynomials)
            self.time_conv = nn.Conv2D(channels=num_of_time_filters,
                                       kernel_size=(1, 3),
                                       padding=(0, 1),
                                       strides=(1, time_conv_strides))
            self.residual_conv = nn.Conv2D(channels=num_of_time_filters,
                                           kernel_size=(1, 1),
                                           strides=(1, time_conv_strides))
            self.ln = nn.LayerNorm(axis=2)
Example #20
 def __init__(self, attention_cell=None, units=128, hidden_size=512, num_heads=4, scaled=True,
              dropout=0.0, attention_dropout=0.0, use_residual=True, output_attention=False,
              weight_initializer=None, bias_initializer='zeros', prefix=None, params=None):
     super().__init__(prefix=prefix, params=params)
     assert attention_cell is None
     self._units = units
     self._num_heads = num_heads
     self._dropout = dropout
     self._use_residual = use_residual
     self._output_attention = output_attention
     self._scaled = scaled
     with self.name_scope():
         if dropout:
             self.dropout_layer = nn.Dropout(rate=dropout)
         assert units % num_heads == 0
         self.attention_cell = PositionalEmbeddingMultiHeadAttentionCell(
             d_head=units // num_heads, num_heads=num_heads, scaled=scaled,
             dropout=attention_dropout)
         self.proj = nn.Dense(units=units, flatten=False, use_bias=False,
                              weight_initializer=weight_initializer,
                              bias_initializer=bias_initializer, prefix='proj_')
         self.ffn = nlp.model.PositionwiseFFN(hidden_size=hidden_size, units=units,
                                              use_residual=use_residual, dropout=dropout,
                                              ffn1_dropout=True, activation='relu',
                                              weight_initializer=weight_initializer,
                                              bias_initializer=bias_initializer,
                                              layer_norm_eps=1e-12)
         self.layer_norm = nn.LayerNorm(in_channels=units, epsilon=1e-12)
Example #21
    def __init__(
        self,
        d_hidden: int,
        d_input: Optional[int] = None,
        d_output: Optional[int] = None,
        d_static: Optional[int] = None,
        dropout: float = 0.0,
        **kwargs,
    ):
        super(GatedResidualNetwork, self).__init__(**kwargs)
        self.d_hidden = d_hidden
        self.d_input = d_input or d_hidden
        self.d_static = d_static or 0
        if d_output is None:
            self.d_output = self.d_input
            self.add_skip = False
        else:
            self.d_output = d_output
            if d_output != self.d_input:
                self.add_skip = True
                with self.name_scope():
                    self.skip_proj = nn.Dense(
                        units=self.d_output,
                        in_units=self.d_input,
                        flatten=False,
                        weight_initializer=init.Xavier(),
                    )
            else:
                self.add_skip = False

        with self.name_scope():
            self.mlp = nn.HybridSequential(prefix="mlp_")
            self.mlp.add(
                nn.Dense(
                    units=self.d_hidden,
                    in_units=self.d_input + self.d_static,
                    flatten=False,
                    weight_initializer=init.Xavier(),
                ))
            self.mlp.add(nn.ELU())
            self.mlp.add(
                nn.Dense(
                    units=self.d_hidden,
                    in_units=self.d_hidden,
                    flatten=False,
                    weight_initializer=init.Xavier(),
                ))
            self.mlp.add(nn.Dropout(dropout))
            self.mlp.add(
                nn.Dense(
                    units=self.d_output * 2,
                    in_units=self.d_hidden,
                    flatten=False,
                    weight_initializer=init.Xavier(),
                ))
            self.mlp.add(GatedLinearUnit(
                axis=-1,
                nonlinear=False,
            ))
            self.lnorm = nn.LayerNorm(axis=-1, in_channels=self.d_output)
Example #22
 def __init__(self, units=768, is_eval=False, prefix=None, params=None):
     super(PoolerEndLogits, self).__init__(prefix=prefix, params=params)
     self._eval = is_eval
     self._hsz = units
     with self.name_scope():
         self.dense_0 = nn.Dense(units, activation='tanh', flatten=False)
         self.dense_1 = nn.Dense(1, flatten=False)
         self.layernorm = nn.LayerNorm(epsilon=1e-12, in_channels=units)
Example #23
 def __init__(self,
              units,
              heads,
              hidden_size,
              qkv_bias=False,
              att_drop=0.,
              drop=0.,
              activation='gelu',
              layer_norm_eps=1e-12):
     super(_TransformerEncoder, self).__init__()
     with self.name_scope():
         self.norm1 = nn.LayerNorm(epsilon=layer_norm_eps,
                                   in_channels=units)
         self.att = _MultiHeadAttention(units, heads, qkv_bias, att_drop,
                                        drop)
         self.norm2 = nn.LayerNorm(epsilon=layer_norm_eps, in_channels=units)
         self.mlp = _MLP(units, hidden_size, activation, drop)
Example #24
 def __init__(self, c_in, T, num_of_vertices, activation='GLU', **kwargs):
     super(Output_layer, self).__init__(**kwargs)
     self.c_in = c_in
     self.layer = nn.HybridSequential()
     self.layer.add(Temporal_conv_layer(T, c_in, c_in, activation),
                    nn.LayerNorm(axis=1),
                    Temporal_conv_layer(1, c_in, c_in, 'sigmoid'),
                    nn.Conv2D(1, (1, 1), activation=None))
Example #25
 def __init__(self, hidden_size, output_size, dropout=0.0, **kwargs):
     super(FeedForward, self).__init__(**kwargs)
     with self.name_scope():
         self.dense1 = nn.Dense(hidden_size,
                                activation='relu',
                                flatten=False)
         self.dense2 = nn.Dense(output_size, flatten=False)
         self.layer_norm = nn.LayerNorm()
         self.dropout = nn.Dropout(dropout)
Example #26
 def __init__(self, layer_num=6, **kwargs):
     super(SimpleNet, self).__init__(**kwargs)
     self._layer_num = layer_num
     self.ln_l = nn.HybridSequential()
     self.dense_l = nn.HybridSequential()
     for i in range(layer_num):
         self.dense_l.add(
             nn.Dense(units=32 + layer_num - 1 - i, flatten=False))
         self.ln_l.add(nn.LayerNorm())
Example #27
    def __init__(self, layer, params, train, **kwargs):
        super(PrePostProcessingWrapper, self).__init__(**kwargs)
        self.postprocess_dropout = params.layer_postprocess_dropout
        self.train = train

        with self.name_scope():
            self.layer = layer
            self.layer_norm = nn.LayerNorm(epsilon=1e-6)
            # Gluon's nn.Dropout expects the drop rate itself; the TensorFlow
            # code this mirrors passed 1 - rate as a keep probability.
            self.dropout = nn.Dropout(self.postprocess_dropout)
Example #28
    def __init__(self,
                 backbone_cfg,
                 weight_initializer=None,
                 bias_initializer=None):
        """

        Parameters
        ----------
        backbone_cfg
            The cfg of the backbone model
        weight_initializer
        bias_initializer
        """
        super().__init__()
        self.backbone_model = MobileBertModel.from_cfg(backbone_cfg)
        if weight_initializer is None:
            weight_initializer = self.backbone_model.weight_initializer
        if bias_initializer is None:
            bias_initializer = self.backbone_model.bias_initializer
        # Construct nsp_classifier for next sentence prediction
        self.nsp_classifier = nn.Dense(units=2,
                                       in_units=self.backbone_model.units,
                                       weight_initializer=weight_initializer,
                                       dtype=self.backbone_model.dtype)
        self.mlm_decoder = nn.HybridSequential()
        # Extra non-linear layer
        self.mlm_decoder.add(
            nn.Dense(units=self.backbone_model.units,
                     in_units=self.backbone_model.units,
                     flatten=False,
                     weight_initializer=weight_initializer,
                     bias_initializer=bias_initializer,
                     dtype=self.backbone_model.dtype))
        self.mlm_decoder.add(get_activation(self.backbone_model.activation))
        # use basic layer normalization for pretraining
        self.mlm_decoder.add(
            nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps,
                         in_channels=self.backbone_model.units))
        # Share only the dense weight with the word embedding; the bias is
        # re-initialized and stored as 'word_embed_bias', which the original
        # embedding does not use.
        self.embedding_table = nn.Dense(
            units=self.backbone_model.vocab_size,
            in_units=self.backbone_model.embed_size,
            flatten=False,
            bias_initializer=bias_initializer,
            dtype=self.backbone_model.dtype)
        self.embedding_table.weight = self.backbone_model.word_embed.weight
        if self.backbone_model.embed_size != self.backbone_model.units:
            self.extra_table = nn.Dense(units=self.backbone_model.vocab_size,
                                        in_units=self.backbone_model.units -
                                        self.backbone_model.embed_size,
                                        flatten=False,
                                        use_bias=False,
                                        bias_initializer=bias_initializer,
                                        dtype=self.backbone_model.dtype)
Example #29
 def __init__(self,
              attention_cell='multi_head',
              num_layers=2,
              units=128,
              hidden_size=2048,
              max_length=50,
              num_heads=4,
              scaled=True,
              scale_embed=True,
              norm_inputs=True,
              dropout=0.0,
              use_residual=True,
              output_attention=False,
              weight_initializer=None,
              bias_initializer='zeros',
              prefix=None,
              params=None):
     super().__init__(prefix=prefix, params=params)
     assert units % num_heads == 0, 'In TransformerDecoder, the units should be divided ' \
                                    'exactly by the number of heads. Received units={}, ' \
                                    'num_heads={}'.format(units, num_heads)
     self._num_layers = num_layers
     self._units = units
     self._hidden_size = hidden_size
     self._num_states = num_heads
     self._max_length = max_length
     self._dropout = dropout
     self._use_residual = use_residual
     self._output_attention = output_attention
     self._scaled = scaled
     self._scale_embed = scale_embed
     self._norm_inputs = norm_inputs
     with self.name_scope():
         if dropout:
             self.dropout_layer = nn.Dropout(rate=dropout)
         if self._norm_inputs:
             self.layer_norm = nn.LayerNorm()
         encoding = _position_encoding_init(max_length, units)
         self.position_weight = self.params.get_constant(
             'const', encoding.astype(np.float32))
         self.transformer_cells = nn.HybridSequential()
         for i in range(num_layers):
             self.transformer_cells.add(
                 TransformerDecoderCell(
                     units=units,
                     hidden_size=hidden_size,
                     num_heads=num_heads,
                     attention_cell=attention_cell,
                     weight_initializer=weight_initializer,
                     bias_initializer=bias_initializer,
                     dropout=dropout,
                     scaled=scaled,
                     use_residual=use_residual,
                     output_attention=output_attention,
                     prefix='transformer%d_' % i))
Example #30
 def __init__(self, model_dim, dropout=0.1):
     super(Resblock, self).__init__()
     self.model_dim = model_dim
     self.dropout = dropout
     self.resblock = nn.Sequential()
     with self.resblock.name_scope():
         self.resblock.add(nn.LayerNorm())
         self.resblock.add(nn.Dense(2 * self.model_dim, in_units=self.model_dim, activation="relu"))
         self.resblock.add(nn.Dropout(self.dropout))
         self.resblock.add(nn.Dense(self.model_dim, in_units=2 * self.model_dim))
         self.resblock.add(nn.Dropout(self.dropout))
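
The residual add is presumably applied in a forward method the snippet omits. A standalone sketch of the same pre-norm stack with that assumed skip connection:

import mxnet as mx
from mxnet.gluon import nn

model_dim = 16
inner = nn.Sequential()               # same layout as self.resblock above
inner.add(nn.LayerNorm())
inner.add(nn.Dense(2 * model_dim, in_units=model_dim, activation='relu'))
inner.add(nn.Dropout(0.1))
inner.add(nn.Dense(model_dim, in_units=2 * model_dim))
inner.add(nn.Dropout(0.1))
inner.initialize()

x = mx.nd.ones((4, model_dim))
out = x + inner(x)                    # assumed residual connection
print(out.shape)                      # (4, 16)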