def __init__(self, units, num_heads, use_bias=True, dtype='float32',
             weight_initializer=None, bias_initializer=None):
    """Create the parameters of several dense layers with the same units.

    With ``prod(num_heads)`` denoting the product of all entries of
    ``num_heads``, the parameters are created with these shapes:

        weight: (prod(num_heads) * units, 0)
        bias:   (prod(num_heads) * units,)

    The second weight dimension is declared as ``0`` and the parameter is
    created with ``allow_deferred_init=True`` (presumably so the input size
    can be filled in later -- confirm against the forward pass).

    Parameters
    ----------
    units : int
        The basic units.
    num_heads : int or tuple
        A single head count, or a list/tuple of head counts. A scalar is
        converted with ``int`` and wrapped in a 1-tuple. Every entry must be
        larger than 0.
    use_bias : bool, default True
        When False no bias parameter is created and ``self.bias`` is None.
    dtype : str, default 'float32'
        The data type
    weight_initializer : None or initializer, default None
    bias_initializer : None or initializer, default None

    Raises
    ------
    ValueError
        If any entry of ``num_heads`` is not larger than 0.
    """
    super().__init__()
    if isinstance(num_heads, (list, tuple)):
        num_heads = tuple(num_heads)
    else:
        num_heads = (int(num_heads), )
    self._num_heads = num_heads
    self._use_bias = use_bias
    if any(head_count <= 0 for head_count in num_heads):
        raise ValueError(
            'Invalid number of heads, all numbers need to be larger than 0.'
            ' num_heads={}'.format(num_heads))
    self._units = units
    self._mult = np.prod(num_heads)
    # Size of the first axis shared by the weight and the bias.
    total_units = self._mult * units
    self.weight = Parameter('weight',
                            shape=(total_units, 0),
                            init=weight_initializer,
                            dtype=dtype,
                            allow_deferred_init=True)
    if not use_bias:
        self.bias = None
    else:
        self.bias = Parameter('bias',
                              shape=(total_units, ),
                              init=bias_initializer,
                              dtype=dtype,
                              allow_deferred_init=True)
def __init__(self, num_features, num_classes, ctx, scale=15):
    """Create the ``norm_weight`` parameter and initialize it on ``ctx``.

    Parameters
    ----------
    num_features
        Size of the second axis of the weight.
    num_classes
        Size of the first axis of the weight; also kept as
        ``self.num_classes``.
    ctx
        Passed unchanged to ``Parameter.initialize``.
    scale : default 15
        Kept as ``self.scale``; it is not used inside this constructor.
    """
    super(NormLinear, self).__init__()
    self.scale = scale
    self.num_classes = num_classes
    weight_shape = (num_classes, num_features)
    with self.name_scope():
        self.weight = Parameter('norm_weight', shape=weight_shape)
        # The weight is initialized eagerly with Xavier(magnitude=2.24).
        xavier = init.Xavier(magnitude=2.24)
        self.weight.initialize(xavier, ctx=ctx)
def optimize(args):
    """Optimize an image directly against a content and a style target.

    Reference: Gatys et al., "Image Style Transfer Using Convolutional
    Neural Networks", CVPR 2016.

    Parameters
    ----------
    args
        Options object. The attributes read here are ``cuda``,
        ``content_image``, ``content_size``, ``style_image``, ``style_size``,
        ``lr``, ``iters``, ``content_weight``, ``style_weight``,
        ``log_interval`` and ``output_image``.
    """
    ctx = mx.gpu(0) if args.cuda else mx.cpu(0)

    # Load the content and style targets and apply the ImageNet-mean
    # preprocessing helper to each of them.
    content_image = utils.subtract_imagenet_mean_preprocess_batch(
        utils.tensor_load_rgbimage(args.content_image,
                                   ctx,
                                   size=args.content_size,
                                   keep_asp=True))
    style_image = utils.subtract_imagenet_mean_preprocess_batch(
        utils.tensor_load_rgbimage(args.style_image,
                                   ctx,
                                   size=args.style_size))

    # Load the pre-trained vgg-16 used as the feature extractor.
    vgg = net.Vgg16()
    utils.init_vgg_params(vgg, 'models', ctx=ctx)

    # Content target: the feature map at index 1 of the content image.
    content_target = vgg(content_image)[1]
    # Style targets: one gram matrix per feature map of the style image.
    style_grams = [net.gram_matrix(feat) for feat in vgg(style_image)]

    # The image being optimized is a Parameter seeded with the content image.
    output = Parameter('output', shape=content_image.shape)
    output.initialize(ctx=ctx)
    output.set_data(content_image)

    trainer = gluon.Trainer([output], 'adam', {'learning_rate': args.lr})
    mse_loss = gluon.loss.L2Loss()

    for iteration in range(1, args.iters + 1):
        # Clamp the current image with bounds 0 and 255 (presumably in
        # place, since the return value is ignored).
        utils.imagenet_clamp_batch(output.data(), 0, 255)
        # (original note) fix BN for pre-trained vgg
        with autograd.record():
            features_y = vgg(output.data())
            content_loss = 2 * args.content_weight * mse_loss(
                features_y[1], content_target)
            style_loss = 0.
            for layer_idx, feat in enumerate(features_y):
                gram_y = net.gram_matrix(feat)
                gram_s = style_grams[layer_idx]
                style_loss = style_loss + 2 * args.style_weight * mse_loss(
                    gram_y, gram_s)
            total_loss = content_loss + style_loss
            total_loss.backward()

        trainer.step(1)
        if iteration % args.log_interval == 0:
            print('loss:{:.2f}'.format(total_loss.asnumpy()[0]))

    # Add the ImageNet mean back and save the first image of the batch.
    final_batch = utils.add_imagenet_mean_batch(output.data())
    utils.tensor_save_bgrimage(final_batch[0], args.output_image, args.cuda)
def __init__(self, in_channels, center=True, scale=True,
             beta_initializer='zeros', gamma_initializer='ones', **kwargs):
    """Create the ``gamma`` and ``beta`` parameters of the layer.

    Parameters
    ----------
    in_channels
        Length of both parameters; each has shape ``(in_channels,)``.
    center : default True
        When falsy, ``beta`` is created with ``grad_req='null'``.
    scale : default True
        When falsy, ``gamma`` is created with ``grad_req='null'``.
    beta_initializer : default 'zeros'
        Initializer given to ``beta``.
    gamma_initializer : default 'ones'
        Initializer given to ``gamma``.
    **kwargs
        Forwarded to the parent constructor.
    """
    super().__init__(**kwargs)
    self._kwargs = dict(center=center, scale=scale)
    self._in_channels = in_channels

    param_shape = (in_channels, )
    gamma_grad_req = 'write' if scale else 'null'
    beta_grad_req = 'write' if center else 'null'
    self.gamma = Parameter('gamma',
                           grad_req=gamma_grad_req,
                           shape=param_shape,
                           init=gamma_initializer)
    self.beta = Parameter('beta',
                          grad_req=beta_grad_req,
                          shape=param_shape,
                          init=beta_initializer)
def test_check_gradient():
    """Run ``check_gradient`` on the sum of squares of a 2x3 parameter."""
    # test check gradient on a simple function
    ctx = mx.cpu()
    w = Parameter(name='w', shape=(2, 3))
    w.initialize('zeros', ctx)
    # Overwrite the zero initialization with fixed, non-trivial values.
    w.set_data(nd.array([[1., 2., 3],
                         [-1., -3., 1.5]]))

    def objective():
        squared = nd.square(w.data())
        return nd.sum(squared)

    assert check_gradient(objective, [], w)
def __init__(self, num_layers=3, units=512, hidden_size=2048, num_heads=8,
             activation_dropout=0.1, dropout=0.1, attention_dropout=0.0,
             layernorm_eps=1E-12, activation='relu', dtype='float32',
             layout='NT', pre_norm=False, weight_initializer=None,
             bias_initializer=None):
    """Create two query bias parameters and a stack of decoder layers.

    ``query_k_bias`` and ``query_r_bias`` both have shape
    ``(num_heads, units // num_heads)`` and use ``bias_initializer``.
    ``decoder_layers`` is an ``nn.HybridSequential`` holding ``num_layers``
    ``TransformerXLDecoderLayer`` blocks that all receive the same settings.

    Parameters
    ----------
    num_layers : default 3
        Number of decoder layers added to ``decoder_layers``.
    units, hidden_size, num_heads, activation_dropout, dropout,
    attention_dropout, activation, dtype, layout, pre_norm,
    weight_initializer, bias_initializer
        Passed to every ``TransformerXLDecoderLayer`` under the same name.
    layernorm_eps : default 1E-12
        Passed to every layer as ``layer_norm_eps``.
    """
    super().__init__()
    # NOTE(review): ``dtype`` is not passed to the two bias parameters
    # below -- confirm that this is intended.
    bias_shape = (num_heads, units // num_heads)
    self.query_k_bias = Parameter('query_k_bias',
                                  shape=bias_shape,
                                  init=bias_initializer,
                                  allow_deferred_init=True)
    self.query_r_bias = Parameter('query_r_bias',
                                  shape=bias_shape,
                                  init=bias_initializer,
                                  allow_deferred_init=True)

    layer_settings = dict(units=units,
                          hidden_size=hidden_size,
                          num_heads=num_heads,
                          activation_dropout=activation_dropout,
                          dropout=dropout,
                          attention_dropout=attention_dropout,
                          layer_norm_eps=layernorm_eps,
                          activation=activation,
                          dtype=dtype,
                          layout=layout,
                          pre_norm=pre_norm,
                          weight_initializer=weight_initializer,
                          bias_initializer=bias_initializer)
    self.decoder_layers = nn.HybridSequential()
    for _ in range(num_layers):
        self.decoder_layers.add(TransformerXLDecoderLayer(**layer_settings))
def __init__(self, in_channels, center=True, scale=True,
             beta_initializer='zeros', gamma_initializer='ones',
             variance_epsilon=1E-6, dtype='float32', **kwargs):
    """Create the ``gamma`` and ``beta`` parameters and store the epsilon.

    Parameters
    ----------
    in_channels
        Length of both parameters; each has shape ``(in_channels,)``.
    center : default True
        When falsy, ``beta`` is created with ``grad_req='null'``.
    scale : default True
        When falsy, ``gamma`` is created with ``grad_req='null'``.
    beta_initializer : default 'zeros'
        Initializer given to ``beta``.
    gamma_initializer : default 'ones'
        Initializer given to ``gamma``.
    variance_epsilon : default 1E-6
        Stored as ``self._epsilon``.
    dtype : default 'float32'
        Data type given to both parameters.
    **kwargs
        Accepted but not forwarded: the parent constructor is called
        without arguments.
    """
    super().__init__()
    self._kwargs = dict(center=center, scale=scale)
    self._in_channels = in_channels
    self._epsilon = variance_epsilon

    param_shape = (in_channels, )
    gamma_grad_req = 'write' if scale else 'null'
    beta_grad_req = 'write' if center else 'null'
    self.gamma = Parameter('gamma',
                           grad_req=gamma_grad_req,
                           shape=param_shape,
                           init=gamma_initializer,
                           dtype=dtype)
    self.beta = Parameter('beta',
                          grad_req=beta_grad_req,
                          shape=param_shape,
                          init=beta_initializer,
                          dtype=dtype)
def __init__(self, units, max_length, mode='clip', dtype='float32',
             weight_initializer=None):
    """Store the configuration and create the ``weight`` parameter.

    Parameters
    ----------
    units
        Size of the second axis of the weight.
    max_length
        Size of the first axis of the weight.
    mode : default 'clip'
        Stored as ``self._mode``; it is not interpreted in this constructor.
    dtype : default 'float32'
        Data type of the weight.
    weight_initializer : default None
        Initializer given to the weight.
    """
    super().__init__()
    self._mode = mode
    self._max_length = max_length
    self._dtype = dtype
    self._units = units
    weight_shape = (max_length, units)
    self.weight = Parameter('weight',
                            shape=weight_shape,
                            init=weight_initializer,
                            dtype=dtype,
                            allow_deferred_init=True)
def __init__(self, units, bidirectional=True, num_buckets=32,
             max_distance=128, dtype='float32', embed_initializer=None):
    """Store the configuration and create the ``weight`` parameter.

    Parameters
    ----------
    units
        Size of the second axis of the weight.
    bidirectional : default True
        Stored as ``self._bidirectional``; not interpreted in this
        constructor.
    num_buckets : default 32
        Size of the first axis of the weight.
    max_distance : default 128
        Stored as ``self._max_distance``; not interpreted in this
        constructor.
    dtype : default 'float32'
        Data type of the weight.
    embed_initializer : default None
        Initializer given to the weight.
    """
    super().__init__()
    self._dtype = dtype
    self._max_distance = max_distance
    self._num_buckets = num_buckets
    self._bidirectional = bidirectional
    self._units = units
    weight_shape = (num_buckets, units)
    self.weight = Parameter('weight',
                            shape=weight_shape,
                            init=embed_initializer,
                            dtype=dtype,
                            allow_deferred_init=True)
def __init__(self, vocab_size: int,
             embed_size: int,
             units: int,
             cutoffs: Optional[Union[int, List]] = None,
             div_val: float = 1.0,
             dtype='float32',
             scaled=True,
             embedding_initializer: InitializerType = None,
             weight_initializer: InitializerType = None):
    """Create the per-cluster embedding and projection parameters.

    Parameters
    ----------
    vocab_size
        The size of the vocabulary
    embed_size
        The base size of the embedding vectors. The embedding size of each
        cluster will be
        [embed_size / div_val**0, embed_size / div_val**1,
         embed_size / div_val**2, ...]
    units
        The number of units after the mapping
    cutoffs
        The cutoffs to slice the vocab to multiple clusters. It should be a
        sorted list. Each value should be between 1 --> vocab_size - 1.
    div_val
        The base denominator for computing the size of the embedding vector
        in each cluster.
    dtype
        The data type of layer
    scaled
        Whether to scale the embedding by sqrt(units)
    embedding_initializer
        Initializer of the embedding vectors
    weight_initializer
        Initializer of projection layers

    Raises
    ------
    ValueError
        If ``div_val`` is large enough that some cluster would get an
        embedding size of 0.
    """
    super().__init__()
    cutoffs = _fmt_and_check_cutoffs(cutoffs, vocab_size)
    # Without cutoffs there is a single cluster, so div_val must stay 1.0.
    assert cutoffs is not None or div_val == 1.0
    self._dtype = dtype
    self._kwargs = OrderedDict([
        ('cutoffs', cutoffs),
        ('vocab_size', vocab_size),
        ('embed_size', embed_size),
        ('units', units),
        ('div_val', div_val),
        ('dtype', dtype),
        ('scaled', scaled),
    ])
    self._vocab_size = vocab_size
    self._cutoffs = cutoffs
    self._units = units
    self._embed_size = embed_size
    self._div_val = div_val
    self._scaled = scaled
    if scaled:
        self._emb_scale = units**0.5

    if div_val == 1.0:
        # One embedding table for the whole vocabulary; a projection is
        # only created when the embedding size differs from ``units``.
        self.embed0_weight = Parameter('embed0_weight',
                                       shape=(vocab_size, embed_size),
                                       init=embedding_initializer,
                                       allow_deferred_init=True)
        if units == embed_size:
            self.proj_layers = None
        else:
            self.inter_proj0_weight = Parameter('inter_proj0_weight',
                                                shape=(embed_size, units),
                                                init=weight_initializer,
                                                allow_deferred_init=True)
    else:
        self.proj_layers = HybridSequential()
        # Consecutive boundaries delimit the vocabulary range of a cluster.
        boundaries = [0] + cutoffs + [vocab_size]
        for i, (l_idx, r_idx) in enumerate(zip(boundaries[:-1],
                                               boundaries[1:])):
            inner_embed_size = int(embed_size / div_val**i)
            if inner_embed_size == 0:
                raise ValueError(
                    'div_val = {} is too large for the layer. Currently, the '
                    'cutoffs are {} and the embed_size is {}. Using the '
                    'div_val = {} will cause some clusters to have '
                    'embed_size=0.'.format(div_val, cutoffs, embed_size,
                                           div_val))
            embed_name = 'embed{}_weight'.format(i)
            proj_name = 'inter_proj{}_weight'.format(i)
            cluster_size = r_idx - l_idx
            setattr(self, embed_name,
                    Parameter(embed_name,
                              shape=(cluster_size, inner_embed_size),
                              init=embedding_initializer,
                              allow_deferred_init=True))
            setattr(self, proj_name,
                    Parameter(proj_name,
                              shape=(inner_embed_size, units),
                              init=weight_initializer,
                              allow_deferred_init=True))
def __init__(
    self,
    d_hidden: int,
    kernel_sizes: List[int],
    n_head: int = 1,
    bias: bool = True,
    bidirectional: bool = False,
    dist_enc: Optional[str] = None,
    share_values: bool = False,
    dropout: float = 0.0,
    temperature: float = 1.0,
    **kwargs,
):
    """
    Self-attention module with q,k,v from the same input

    Parameters
    ----------
    d_hidden : int
        hidden dimension
    kernel_sizes : List[int]
        kernel sizes of convolutions to generate queries and keys; one
        convolution group is created per entry
    n_head : int, optional
        number of attention heads, by default 1
    bias : bool, optional
        add bias term in input and output projections, by default True
    bidirectional : bool, optional
        if False, add a mask to avoid backward attention, by default False
    dist_enc : Optional[str], optional
        add relative distance embeddings to dot-product attention, can be
            'add' (linearly combine key and dist),
            'dot' (dot product between key and dist),
            or None (disabled),
        by default None
    share_values : bool, optional
        if True, a value representation is shared by all attention heads,
        by default False
        ref. https://arxiv.org/abs/1912.09363
    dropout : float, optional
        dropout rate, by default 0.0
    temperature : float, optional
        softmax temperature, by default 1.0
    **kwargs
        forwarded to the parent constructor

    Raises
    ------
    AssertionError
        if ``d_hidden`` is not divisible by ``n_head`` or by the number of
        kernel sizes, if ``n_head`` is not divisible by the number of
        kernel sizes, or if ``dist_enc`` is neither None, "dot" nor "add"
    """
    super(SelfAttention, self).__init__(**kwargs)
    n_groups = len(kernel_sizes)
    assert (
        d_hidden % n_head == 0
    ), f"hidden dim {d_hidden} cannot be split into {n_head} heads."
    assert (
        d_hidden % n_groups == 0
    ), f"hidden dim {d_hidden} cannot be split into {n_groups} groups."
    # The message must interpolate ``n_head``: the previous ``n_heads`` is
    # not defined, so a failing check raised NameError instead of the
    # intended AssertionError.
    assert (
        n_head % n_groups == 0
    ), f"num_heads {n_head} cannot be allocated for {n_groups} groups."

    self.d_hidden = d_hidden
    self.kernel_sizes = kernel_sizes
    self.n_groups = n_groups
    self.d_group = self.d_hidden // self.n_groups
    self.n_head = n_head
    self.d_head = self.d_hidden // self.n_head
    self.bias = bias
    self.dist_enc = dist_enc
    self.bidirectional = bidirectional
    self.share_values = share_values
    self.temperature = temperature

    with self.name_scope():
        # One causal convolution per kernel size; each emits 2 * d_group
        # channels and the outputs are concatenated on the last axis.
        self.qk_proj = HybridConcurrent(axis=-1, prefix="qk_proj_")
        for ksize in self.kernel_sizes:
            self.qk_proj.add(
                CausalConv1D(
                    channels=self.d_group * 2,
                    kernel_size=ksize,
                    prefix=f"conv{ksize}_",
                ))
        # A shared value projection only needs the width of a single head.
        self.v_proj = nn.Dense(
            units=self.d_head if self.share_values else d_hidden,
            use_bias=bias,
            flatten=False,
            weight_initializer=init.Xavier(),
            prefix="v_proj_",
        )
        self.out_proj = nn.Dense(
            units=d_hidden,
            use_bias=bias,
            flatten=False,
            weight_initializer=init.Xavier(),
            prefix="out_proj_",
        )

        if self.dist_enc is not None:
            assert self.dist_enc in [
                "dot",
                "add",
            ], f"distance encoding type {self.dist_enc} is not supported"
            self.posemb = SinusoidalPositionalEmbedding(d_hidden)
            self.pos_proj = nn.Dense(
                units=d_hidden,
                use_bias=bias,
                flatten=False,
                weight_initializer=init.Xavier(),
                prefix="pos_proj_",
            )
            if self.dist_enc == "add":
                # Two extra bias parameters exist only in "add" mode.
                self._ctt_bias_weight = Parameter(
                    "_ctt_bias_weight",
                    shape=(1, n_head, 1, self.d_head),
                    init=init.Xavier(),
                )
                self._pos_bias_weight = Parameter(
                    "_pos_bias_weight",
                    shape=(1, n_head, 1, self.d_head),
                    init=init.Xavier(),
                )

        self.dropout = nn.Dropout(dropout)
def __init__(self, d_model, epsilon, dtype):
    """Create the ``layernorm_weight`` parameter and store the epsilon.

    Parameters
    ----------
    d_model
        Passed directly as the ``shape`` of the weight parameter.
    epsilon
        Stored as ``self.variance_epsilon``.
    dtype
        Data type of the weight parameter.
    """
    super().__init__()
    self.variance_epsilon = epsilon
    # NOTE(review): the attribute is spelled ``gemma`` (possibly meant
    # ``gamma``); the name is kept because other code may read it.
    self.gemma = Parameter('layernorm_weight',
                           shape=d_model,
                           init='ones',
                           dtype=dtype)