Example #1
File: layers.py Project: liuzh91/gluon-nlp
 def __init__(self,
              units,
              num_heads,
              use_bias=True,
              dtype='float32',
              weight_initializer=None,
              bias_initializer=None):
     """Multiple Dense with different parameters and the same number of units
     The inner shapes of the weight and bias are
         weight: (self._parallel_num[0] * ... * self._parallel_num[k] * units, in_units)
         bias: (self._parallel_num[0] * ... * self._parallel_num[k],)
     Parameters
     ----------
     units : int
         The base number of output units per head.
     num_heads : int or tuple
         The number of heads along each parallel dimension; an int is treated as a 1-tuple.
     use_bias : bool, default True
     dtype : str, default 'float32'
         The data type
     weight_initializer : None or initializer, default None
     bias_initializer : None or initializer, default None
     """
     super().__init__()
     if not isinstance(num_heads, (list, tuple)):
         num_heads = (int(num_heads), )
     else:
         num_heads = tuple(num_heads)
     self._num_heads = num_heads
     self._use_bias = use_bias
     for ele in self._num_heads:
         if ele <= 0:
             raise ValueError(
                 'Invalid number of heads, all numbers need to be larger than 0.'
                 ' num_heads={}'.format(num_heads))
     self._units = units
     self._mult = np.prod(num_heads)
     self.weight = Parameter('weight',
                             shape=(self._mult * units, 0),
                             init=weight_initializer,
                             dtype=dtype,
                             allow_deferred_init=True)
     if use_bias:
         self.bias = Parameter('bias',
                               shape=(self._mult * units, ),
                               init=bias_initializer,
                               dtype=dtype,
                               allow_deferred_init=True)
     else:
         self.bias = None
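A minimal standalone sketch of the deferred-initialization pattern this constructor relies on (assumes only mxnet; the numbers are illustrative):

import mxnet as mx
from mxnet.gluon import Parameter

# in_units is written as 0, so the full weight shape is only fixed once the
# first forward pass reveals the input width; until then no storage is
# allocated and initialize() merely records the pending initializer.
units, mult = 4, 6                      # e.g. num_heads = (2, 3) -> mult = 2 * 3
weight = Parameter('weight', shape=(mult * units, 0), allow_deferred_init=True)
weight.initialize(ctx=mx.cpu())         # deferred: no data is created yet
print(weight.shape)                     # (24, 0), second axis still unknown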
Example #2
 def __init__(self, num_features, num_classes, ctx, scale=15):
     super(NormLinear, self).__init__()
     self.num_classes = num_classes
     self.scale = scale
     with self.name_scope():
         self.weight = Parameter('norm_weight', shape=(num_classes, num_features))
         self.weight.initialize(init.Xavier(magnitude=2.24), ctx=ctx)
Example #3
def optimize(args):
    """    Gatys et al. CVPR 2017
    ref: Image Style Transfer Using Convolutional Neural Networks
    """
    if args.cuda:
        ctx = mx.gpu(0)
    else:
        ctx = mx.cpu(0)
    # load the content and style target
    content_image = utils.tensor_load_rgbimage(args.content_image,
                                               ctx,
                                               size=args.content_size,
                                               keep_asp=True)
    content_image = utils.subtract_imagenet_mean_preprocess_batch(
        content_image)
    style_image = utils.tensor_load_rgbimage(args.style_image,
                                             ctx,
                                             size=args.style_size)
    style_image = utils.subtract_imagenet_mean_preprocess_batch(style_image)
    # load the pre-trained vgg-16 and extract features
    vgg = net.Vgg16()
    utils.init_vgg_params(vgg, 'models', ctx=ctx)
    # content feature
    f_xc_c = vgg(content_image)[1]
    # style feature
    features_style = vgg(style_image)
    gram_style = [net.gram_matrix(y) for y in features_style]
    # output
    output = Parameter('output', shape=content_image.shape)
    output.initialize(ctx=ctx)
    output.set_data(content_image)
    # optimizer
    trainer = gluon.Trainer([output], 'adam', {'learning_rate': args.lr})
    mse_loss = gluon.loss.L2Loss()

    # optimizing the images
    for e in range(args.iters):
        utils.imagenet_clamp_batch(output.data(), 0, 255)
        # fix BN for pre-trained vgg
        with autograd.record():
            features_y = vgg(output.data())
            content_loss = 2 * args.content_weight * mse_loss(
                features_y[1], f_xc_c)
            style_loss = 0.
            for m in range(len(features_y)):
                gram_y = net.gram_matrix(features_y[m])
                gram_s = gram_style[m]
                style_loss = style_loss + 2 * args.style_weight * mse_loss(
                    gram_y, gram_s)
            total_loss = content_loss + style_loss
            total_loss.backward()

        trainer.step(1)
        if (e + 1) % args.log_interval == 0:
            print('loss:{:.2f}'.format(total_loss.asnumpy()[0]))

    # save the image
    output = utils.add_imagenet_mean_batch(output.data())
    utils.tensor_save_bgrimage(output[0], args.output_image, args.cuda)
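The key Parameter pattern in this function is treating the output image itself as the single trainable parameter. A reduced, self-contained sketch of that pattern with illustrative values (not code from this project):

import mxnet as mx
from mxnet import autograd, gluon, nd

# A Parameter holding the quantity being optimized, updated directly by a
# Trainer -- the same role 'output' plays above.
target = nd.ones((2, 3)) * 5.0
x = gluon.Parameter('x', shape=(2, 3))
x.initialize(ctx=mx.cpu())
x.set_data(nd.zeros((2, 3)))

trainer = gluon.Trainer([x], 'adam', {'learning_rate': 0.1})
mse_loss = gluon.loss.L2Loss()
for _ in range(200):
    with autograd.record():
        loss = mse_loss(x.data(), target)
    loss.backward()
    trainer.step(1)
print(x.data())   # moves toward the target values of 5.0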
Example #4
 def __init__(self,
              in_channels,
              center=True,
              scale=True,
              beta_initializer='zeros',
              gamma_initializer='ones',
              **kwargs):
     super().__init__(**kwargs)
     self._kwargs = {'center': center, 'scale': scale}
     self._in_channels = in_channels
     self.gamma = Parameter('gamma',
                            grad_req='write' if scale else 'null',
                            shape=(in_channels, ),
                            init=gamma_initializer)
     self.beta = Parameter('beta',
                           grad_req='write' if center else 'null',
                           shape=(in_channels, ),
                           init=beta_initializer)
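The grad_req switch above is what makes center and scale optional: a parameter created with grad_req='null' allocates no gradient buffer and is left unchanged during training. A small illustrative sketch of the difference (not code from this project):

from mxnet.gluon import Parameter

frozen = Parameter('beta', grad_req='null', shape=(4,), init='zeros')
trained = Parameter('gamma', grad_req='write', shape=(4,), init='ones')
frozen.initialize()
trained.initialize()
print(trained.grad().shape)   # (4,): a gradient buffer exists and will be updated
# frozen.grad() would raise, because grad_req='null' skips gradient storage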
Example #5
def test_check_gradient():
    # test check gradient on a simple function
    ctx = mx.cpu()
    w = Parameter(name='w', shape=(2, 3))
    w.initialize('zeros', ctx)
    w.set_data(nd.array([[1., 2., 3], [-1., -3., 1.5]]))

    def f():
        return nd.sum(nd.square(w.data()))

    assert check_gradient(f, [], w)
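check_gradient here is a helper local to this project. As a rough illustration of what such a check does, a minimal central-difference comparison against autograd might look like this (a sketch, not the project's implementation):

import mxnet as mx
from mxnet import autograd, nd
from mxnet.gluon import Parameter

w = Parameter('w', shape=(2, 3))
w.initialize(ctx=mx.cpu())
w.set_data(nd.array([[1., 2., 3.], [-1., -3., 1.5]]))

def f():
    return nd.sum(nd.square(w.data()))

# analytic gradient of f with respect to w
with autograd.record():
    out = f()
out.backward()
analytic = w.grad().asnumpy()

# central-difference estimate for the (0, 0) entry
eps = 1e-4
base = w.data().asnumpy()
plus, minus = base.copy(), base.copy()
plus[0, 0] += eps
minus[0, 0] -= eps
w.set_data(nd.array(plus))
f_plus = f().asscalar()
w.set_data(nd.array(minus))
f_minus = f().asscalar()
numeric = (f_plus - f_minus) / (2 * eps)
print(analytic[0, 0], numeric)   # both should be close to 2 * 1.0 = 2.0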
Example #6
 def __init__(self,
              num_layers=3,
              units=512,
              hidden_size=2048,
              num_heads=8,
              activation_dropout=0.1,
              dropout=0.1,
              attention_dropout=0.0,
              layernorm_eps=1E-12,
              activation='relu',
              dtype='float32',
              layout='NT',
              pre_norm=False,
              weight_initializer=None,
              bias_initializer=None):
     super().__init__()
     self.query_k_bias = Parameter('query_k_bias',
                                   shape=(num_heads, units // num_heads),
                                   init=bias_initializer,
                                   allow_deferred_init=True)
     self.query_r_bias = Parameter('query_r_bias',
                                   shape=(num_heads, units // num_heads),
                                   init=bias_initializer,
                                   allow_deferred_init=True)
     self.decoder_layers = nn.HybridSequential()
     for i in range(num_layers):
         self.decoder_layers.add(
             TransformerXLDecoderLayer(
                 units=units,
                 hidden_size=hidden_size,
                 num_heads=num_heads,
                 activation_dropout=activation_dropout,
                 dropout=dropout,
                 attention_dropout=attention_dropout,
                 layer_norm_eps=layernorm_eps,
                 activation=activation,
                 dtype=dtype,
                 layout=layout,
                 pre_norm=pre_norm,
                 weight_initializer=weight_initializer,
                 bias_initializer=bias_initializer))
Example #7
 def __init__(self,
              in_channels,
              center=True,
              scale=True,
              beta_initializer='zeros',
              gamma_initializer='ones',
              variance_epsilon=1E-6,
              dtype='float32',
              **kwargs):
     super().__init__()
     self._kwargs = {'center': center, 'scale': scale}
     self._in_channels = in_channels
     self._epsilon = variance_epsilon
     self.gamma = Parameter('gamma',
                            grad_req='write' if scale else 'null',
                            shape=(in_channels, ),
                            init=gamma_initializer,
                            dtype=dtype)
     self.beta = Parameter('beta',
                           grad_req='write' if center else 'null',
                           shape=(in_channels, ),
                           init=beta_initializer,
                           dtype=dtype)
Example #8
    def __init__(self,
                 units,
                 max_length,
                 mode='clip',
                 dtype='float32',
                 weight_initializer=None):
        super().__init__()
        self._units = units
        self._dtype = dtype
        self._max_length = max_length
        self._mode = mode

        self.weight = Parameter('weight',
                                shape=(max_length, units),
                                init=weight_initializer,
                                dtype=dtype,
                                allow_deferred_init=True)
Example #9
 def __init__(self,
              units,
              bidirectional=True,
              num_buckets=32,
              max_distance=128,
              dtype='float32',
              embed_initializer=None):
     super().__init__()
     self._units = units
     self._bidirectional = bidirectional
     self._num_buckets = num_buckets
     self._max_distance = max_distance
     self._dtype = dtype
     self.weight = Parameter('weight',
                             shape=(num_buckets, units),
                             init=embed_initializer,
                             dtype=dtype,
                             allow_deferred_init=True)
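The (num_buckets, units) weight above is an embedding table indexed by bucketed relative positions. A reduced sketch of that lookup with illustrative shapes (the bucketing function itself is project-specific and not shown):

import mxnet as mx
from mxnet import nd
from mxnet.gluon import Parameter

num_buckets, units = 32, 8
weight = Parameter('weight', shape=(num_buckets, units))
weight.initialize(ctx=mx.cpu())

buckets = nd.array([0, 3, 31, 7])            # bucketed relative distances
embeddings = nd.Embedding(data=buckets, weight=weight.data(),
                          input_dim=num_buckets, output_dim=units)
print(embeddings.shape)                      # (4, 8)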
Example #10
    def __init__(self,
                 vocab_size: int,
                 embed_size: int,
                 units: int,
                 cutoffs: Optional[Union[int, List]] = None,
                 div_val: float = 1.0,
                 dtype='float32',
                 scaled=True,
                 embedding_initializer: InitializerType = None,
                 weight_initializer: InitializerType = None):
        """

        Parameters
        ----------
        vocab_size
            The size of the vocabulary
        embed_size
            The base size of the embedding vectors. The embedding size of each cluster will be
            [embed_size / div_val**0, embed_size / div_val**1, embed_size / div_val**2, ...]
        units
            The number of units after the mapping
        cutoffs
            The cutoffs to slice the vocab to multiple clusters. It should be a sorted list. Each
            value should be between 1 and vocab_size - 1.
        div_val
            The base denominator for computing the size of the embedding vector in each cluster.
        dtype
            The data type of layer
        scaled
            Whether to scale the embedding by sqrt(units)
        embedding_initializer
            Initializer of the embedding vectors
        weight_initializer
            Initializer of projection layers
        """
        super().__init__()
        cutoffs = _fmt_and_check_cutoffs(cutoffs, vocab_size)
        if cutoffs is None:
            assert div_val == 1.0
        self._dtype = dtype
        self._kwargs = OrderedDict([('cutoffs', cutoffs),
                                    ('vocab_size', vocab_size),
                                    ('embed_size', embed_size),
                                    ('units', units), ('div_val', div_val),
                                    ('dtype', dtype), ('scaled', scaled)])
        self._vocab_size = vocab_size
        self._cutoffs = cutoffs
        self._units = units
        self._embed_size = embed_size
        self._div_val = div_val
        self._scaled = scaled
        if self._scaled:
            self._emb_scale = units**0.5
        if div_val == 1.0:
            self.embed0_weight = Parameter('embed0_weight',
                                           shape=(vocab_size, embed_size),
                                           init=embedding_initializer,
                                           allow_deferred_init=True)

            if units != embed_size:
                self.inter_proj0_weight = Parameter('inter_proj0_weight',
                                                    shape=(embed_size, units),
                                                    init=weight_initializer,
                                                    allow_deferred_init=True)
            else:
                self.proj_layers = None
        else:
            self.proj_layers = HybridSequential()
            for i, (l_idx, r_idx) in enumerate(
                    zip([0] + cutoffs, cutoffs + [vocab_size])):
                inner_embed_size = int(embed_size / div_val**i)
                if inner_embed_size == 0:
                    raise ValueError(
                        'div_val = {} is too large for the layer. Currently, the '
                        'cutoffs are {} and the embed_size is {}. Using the '
                        'div_val = {} will cause some clusters to have '
                        'embed_size=0.'.format(div_val, cutoffs, embed_size,
                                               div_val))
                setattr(
                    self, 'embed{}_weight'.format(i),
                    Parameter('embed{}_weight'.format(i),
                              shape=(r_idx - l_idx, inner_embed_size),
                              init=embedding_initializer,
                              allow_deferred_init=True))
                setattr(
                    self, 'inter_proj{}_weight'.format(i),
                    Parameter('inter_proj{}_weight'.format(i),
                              shape=(inner_embed_size, units),
                              init=weight_initializer,
                              allow_deferred_init=True))
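A small worked example of how the cluster shapes come out for one concrete configuration (hypothetical numbers, just to illustrate the div_val arithmetic in the loop above):

vocab_size, embed_size, div_val = 1000, 256, 2.0
cutoffs = [200, 600]          # clusters cover [0, 200), [200, 600), [600, 1000)
for i, (l_idx, r_idx) in enumerate(zip([0] + cutoffs, cutoffs + [vocab_size])):
    inner_embed_size = int(embed_size / div_val ** i)
    print(i, (r_idx - l_idx, inner_embed_size))
# prints: 0 (200, 256)
#         1 (400, 128)
#         2 (400, 64)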
Example #11
    def __init__(
        self,
        d_hidden: int,
        kernel_sizes: List[int],
        n_head: int = 1,
        bias: bool = True,
        bidirectional: bool = False,
        dist_enc: Optional[str] = None,
        share_values: bool = False,
        dropout: float = 0.0,
        temperature: float = 1.0,
        **kwargs,
    ):
        """
        Self-attention module with q,k,v from the same input

        Parameters
        ----------
        d_hidden : int
            hidden dimension
        kernel_sizes : List[int]
            kernel sizes of convolutions to generate queries and keys
        n_head : int, optional
            number of attention heads, by default 1
        bias : bool, optional
            add bias term in input and output projections, by default True
        bidirectional : bool, optional
            if False, add a mask to avoid backward attention, by default False
        dist_enc : Optional[str], optional
            add relative distance embeddings to dot-product attention, can be 
                'add' (linearly combine key and dist),
                'dot' (dot product between key and dist), 
                or None (disabled),
            by default None
        share_values : bool, optional
            if True, a value representation is shared by all attention heads, by default False
            ref. https://arxiv.org/abs/1912.09363
        dropout : float, optional
            dropout rate, by default 0.0
        temperature : float, optional
            softmax temperature, by default 1.0
        """
        super(SelfAttention, self).__init__(**kwargs)
        n_groups = len(kernel_sizes)
        assert (
            d_hidden % n_head == 0
        ), f"hidden dim {d_hidden} cannot be split into {n_head} heads."
        assert (
            d_hidden % n_groups == 0
        ), f"hidden dim {d_hidden} cannot be split into {n_groups} groups."
        assert (
            n_head % n_groups == 0
        ), f"num_heads {n_heads} cannot be allocated for {n_groups} groups."
        self.d_hidden = d_hidden
        self.kernel_sizes = kernel_sizes
        self.n_groups = n_groups
        self.d_group = self.d_hidden // self.n_groups
        self.n_head = n_head
        self.d_head = self.d_hidden // self.n_head
        self.bias = bias
        self.dist_enc = dist_enc
        self.bidirectional = bidirectional
        self.share_values = share_values
        self.temperature = temperature

        with self.name_scope():
            self.qk_proj = HybridConcurrent(axis=-1, prefix="qk_proj_")
            for ksize in self.kernel_sizes:
                self.qk_proj.add(
                    CausalConv1D(
                        channels=self.d_group * 2,
                        kernel_size=ksize,
                        prefix=f"conv{ksize}_",
                    ))
            self.v_proj = nn.Dense(
                units=self.d_head if self.share_values else d_hidden,
                use_bias=bias,
                flatten=False,
                weight_initializer=init.Xavier(),
                prefix="v_proj_",
            )
            self.out_proj = nn.Dense(
                units=d_hidden,
                use_bias=bias,
                flatten=False,
                weight_initializer=init.Xavier(),
                prefix="out_proj_",
            )

            if self.dist_enc is not None:
                assert self.dist_enc in [
                    "dot",
                    "add",
                ], f"distance encoding type {self.dist_enc} is not supported"
                self.posemb = SinusoidalPositionalEmbedding(d_hidden)
                self.pos_proj = nn.Dense(
                    units=d_hidden,
                    use_bias=bias,
                    flatten=False,
                    weight_initializer=init.Xavier(),
                    prefix="pos_proj_",
                )
                if self.dist_enc == "add":
                    self._ctt_bias_weight = Parameter(
                        "_ctt_bias_weight",
                        shape=(1, n_head, 1, self.d_head),
                        init=init.Xavier(),
                    )
                    self._pos_bias_weight = Parameter(
                        "_pos_bias_weight",
                        shape=(1, n_head, 1, self.d_head),
                        init=init.Xavier(),
                    )

            self.dropout = nn.Dropout(dropout)
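A quick check of the dimension bookkeeping above with hypothetical numbers: each CausalConv1D in qk_proj emits d_group * 2 channels, so concatenating over the n_groups kernel sizes yields 2 * d_hidden channels, presumably split into queries and keys of width d_hidden each downstream (the forward pass is not shown here).

d_hidden, n_head = 64, 8
kernel_sizes = [3, 5]                 # n_groups = 2
n_groups = len(kernel_sizes)
d_group = d_hidden // n_groups        # 32; each conv emits d_group * 2 = 64 channels
d_head = d_hidden // n_head           # 8, satisfies all three assertions above
qk_width = n_groups * d_group * 2     # 128 = queries (64) + keys (64)
print(d_group, d_head, qk_width)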
Example #12
 def __init__(self, d_model, epsilon, dtype): 
     super().__init__()
     self.gemma = Parameter('layernorm_weight', shape=d_model, init='ones', dtype=dtype)
     self.variance_epsilon = epsilon
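The forward pass is not shown, but a weight-only layer norm parameterized with a variance epsilon like this is typically applied as a T5-style root-mean-square normalization. A hedged sketch of that usage (an assumption, not code from this project):

from mxnet import nd

# Hypothetical forward for a scale-only layer norm: divide by the root mean
# square over the last axis and rescale by the learned weight; no mean
# subtraction and no bias, matching the single weight parameter above.
def rms_norm(x, weight, eps=1e-6):
    variance = nd.mean(nd.square(x), axis=-1, keepdims=True)
    normed = x / nd.sqrt(variance + eps)
    return normed * weight.reshape((1, 1, -1))   # assumes (batch, seq, d_model) input

x = nd.random.normal(shape=(2, 3, 8))
weight = nd.ones((8,))
print(rms_norm(x, weight).shape)   # (2, 3, 8)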