# Imports assumed by this snippet (`net` and `utils` are the example's local
# helper modules).
import mxnet as mx
from mxnet import autograd, gluon
from mxnet.gluon import Parameter

import net
import utils


def optimize(args):
    """Gatys et al. CVPR 2016
    ref: Image Style Transfer Using Convolutional Neural Networks
    """
    if args.cuda:
        ctx = mx.gpu(0)
    else:
        ctx = mx.cpu(0)
    # load the content and style targets
    content_image = utils.tensor_load_rgbimage(args.content_image, ctx,
                                               size=args.content_size,
                                               keep_asp=True)
    content_image = utils.subtract_imagenet_mean_preprocess_batch(content_image)
    style_image = utils.tensor_load_rgbimage(args.style_image, ctx,
                                             size=args.style_size)
    style_image = utils.subtract_imagenet_mean_preprocess_batch(style_image)
    # load the pre-trained vgg-16 and extract features
    vgg = net.Vgg16()
    utils.init_vgg_params(vgg, 'models', ctx=ctx)
    # content feature
    f_xc_c = vgg(content_image)[1]
    # style features
    features_style = vgg(style_image)
    gram_style = [net.gram_matrix(y) for y in features_style]
    # the output image is the variable being optimized
    output = Parameter('output', shape=content_image.shape)
    output.initialize(ctx=ctx)
    output.set_data(content_image)
    # optimizer
    trainer = gluon.Trainer([output], 'adam', {'learning_rate': args.lr})
    mse_loss = gluon.loss.L2Loss()

    # optimize the image
    for e in range(args.iters):
        utils.imagenet_clamp_batch(output.data(), 0, 255)
        # fix BN for pre-trained vgg
        with autograd.record():
            features_y = vgg(output.data())
            content_loss = 2 * args.content_weight * mse_loss(features_y[1], f_xc_c)
            style_loss = 0.
            for m in range(len(features_y)):
                gram_y = net.gram_matrix(features_y[m])
                gram_s = gram_style[m]
                style_loss = style_loss + 2 * args.style_weight * mse_loss(gram_y, gram_s)
            total_loss = content_loss + style_loss
            total_loss.backward()

        trainer.step(1)
        if (e + 1) % args.log_interval == 0:
            print('loss:{:.2f}'.format(total_loss.asnumpy()[0]))

    # save the image
    output = utils.add_imagenet_mean_batch(output.data())
    utils.tensor_save_bgrimage(output[0], args.output_image, args.cuda)
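# A minimal driver sketch: the argparse.Namespace fields mirror exactly the
# attributes optimize() reads; the image paths and hyperparameter values are
# hypothetical placeholders.
def example_optimize():
    from argparse import Namespace
    args = Namespace(
        cuda=False,                          # True to run on mx.gpu(0)
        content_image='images/content.jpg',  # hypothetical path
        style_image='images/style.jpg',      # hypothetical path
        output_image='images/output.jpg',    # hypothetical path
        content_size=512,
        style_size=512,
        lr=1e1,
        iters=500,
        content_weight=1.0,
        style_weight=5.0,
        log_interval=50,
    )
    optimize(args)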
def __init__(
    self,
    d_hidden: int,
    kernel_sizes: List[int],
    n_head: int = 1,
    bias: bool = True,
    bidirectional: bool = False,
    dist_enc: Optional[str] = None,
    share_values: bool = False,
    dropout: float = 0.0,
    temperature: float = 1.0,
    **kwargs,
):
    """
    Self-attention module with q, k, v from the same input

    Parameters
    ----------
    d_hidden : int
        hidden dimension
    kernel_sizes : List[int]
        kernel sizes of convolutions to generate queries and keys
    n_head : int, optional
        number of attention heads, by default 1
    bias : bool, optional
        add bias term in input and output projections, by default True
    bidirectional : bool, optional
        if False, add a mask to avoid backward attention, by default False
    dist_enc : Optional[str], optional
        add relative distance embeddings to dot-product attention, can be
        'add' (linearly combine key and dist), 'dot' (dot product between
        key and dist), or None (disabled), by default None
    share_values : bool, optional
        if True, a value representation is shared by all attention heads,
        by default False
        ref. https://arxiv.org/abs/1912.09363
    dropout : float, optional
        dropout rate, by default 0.0
    temperature : float, optional
        softmax temperature, by default 1.0
    """
    super(SelfAttention, self).__init__(**kwargs)
    n_groups = len(kernel_sizes)
    assert (
        d_hidden % n_head == 0
    ), f"hidden dim {d_hidden} cannot be split into {n_head} heads."
    assert (
        d_hidden % n_groups == 0
    ), f"hidden dim {d_hidden} cannot be split into {n_groups} groups."
    assert (
        n_head % n_groups == 0
    ), f"num_heads {n_head} cannot be allocated for {n_groups} groups."
    self.d_hidden = d_hidden
    self.kernel_sizes = kernel_sizes
    self.n_groups = n_groups
    self.d_group = self.d_hidden // self.n_groups
    self.n_head = n_head
    self.d_head = self.d_hidden // self.n_head
    self.bias = bias
    self.dist_enc = dist_enc
    self.bidirectional = bidirectional
    self.share_values = share_values
    self.temperature = temperature

    with self.name_scope():
        # one causal conv per kernel size; each emits queries and keys
        # (2 * d_group channels) for its group of heads
        self.qk_proj = HybridConcurrent(axis=-1, prefix="qk_proj_")
        for ksize in self.kernel_sizes:
            self.qk_proj.add(
                CausalConv1D(
                    channels=self.d_group * 2,
                    kernel_size=ksize,
                    prefix=f"conv{ksize}_",
                )
            )
        self.v_proj = nn.Dense(
            units=self.d_head if self.share_values else d_hidden,
            use_bias=bias,
            flatten=False,
            weight_initializer=init.Xavier(),
            prefix="v_proj_",
        )
        self.out_proj = nn.Dense(
            units=d_hidden,
            use_bias=bias,
            flatten=False,
            weight_initializer=init.Xavier(),
            prefix="out_proj_",
        )
        if self.dist_enc is not None:
            assert self.dist_enc in [
                "dot",
                "add",
            ], f"distance encoding type {self.dist_enc} is not supported"
            self.posemb = SinusoidalPositionalEmbedding(d_hidden)
            self.pos_proj = nn.Dense(
                units=d_hidden,
                use_bias=bias,
                flatten=False,
                weight_initializer=init.Xavier(),
                prefix="pos_proj_",
            )
            if self.dist_enc == "add":
                self._ctt_bias_weight = Parameter(
                    "_ctt_bias_weight",
                    shape=(1, n_head, 1, self.d_head),
                    init=init.Xavier(),
                )
                self._pos_bias_weight = Parameter(
                    "_pos_bias_weight",
                    shape=(1, n_head, 1, self.d_head),
                    init=init.Xavier(),
                )
        self.dropout = nn.Dropout(dropout)
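# A dimensionality sketch for the asserts above (illustrative numbers, not
# from the module): with d_hidden=64, kernel_sizes=[1, 3] (so n_groups=2) and
# n_head=4, d_group = 64 // 2 = 32, so each CausalConv1D emits 2 * 32 = 64
# channels (queries and keys for its group of 2 heads), and d_head = 64 // 4
# = 16 is the per-head width. Instantiation, assuming the full SelfAttention
# class and its GluonTS dependencies are in scope:
def example_self_attention():
    attn = SelfAttention(d_hidden=64, kernel_sizes=[1, 3], n_head=4)
    attn.initialize()
    return attn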
class NoNorm(HybridBlock):
    r"""
    Apply an element-wise linear transformation to the n-dimensional input
    array, replacing layer normalization.

    .. math::
        out = \gamma \circ data + \beta

    Parameters
    ----------
    in_channels : int
        Number of channels (feature maps) in input data.
    center : bool, default True
        If True, add offset of `beta` to the scaled tensor.
        If False, `beta` is ignored.
    scale : bool, default True
        If True, multiply by `gamma`. If False, `gamma` is not used.
    beta_initializer : str or `Initializer`, default 'zeros'
        Initializer for the beta weight.
    gamma_initializer : str or `Initializer`, default 'ones'
        Initializer for the gamma weight.
    dtype : str, default 'float32'
        Data type of the gamma and beta parameters.

    Inputs:
        - **data**: input tensor with arbitrary shape.

    Outputs:
        - **out**: output tensor with the same shape as `data`.

    References
    ----------
        `MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices
        <https://arxiv.org/pdf/2004.02984.pdf>`_

    Examples
    --------
    >>> # Input of shape (2, 5)
    >>> x = mx.np.array([[1, 2, 3, 4, 5], [1, 1, 2, 2, 2]])
    >>> # NoNorm is computed with the above formula
    >>> layer = NoNorm(in_channels=5)
    >>> layer.initialize(ctx=mx.cpu(0))
    >>> layer(x)
    array([[1., 2., 3., 4., 5.],
           [1., 1., 2., 2., 2.]])
    """
    def __init__(self, in_channels, center=True, scale=True,
                 beta_initializer='zeros', gamma_initializer='ones',
                 dtype='float32', **kwargs):
        super().__init__(**kwargs)
        self._kwargs = {'center': center, 'scale': scale}
        self._in_channels = in_channels
        self.gamma = Parameter('gamma',
                               grad_req='write' if scale else 'null',
                               shape=(in_channels,),
                               init=gamma_initializer,
                               dtype=dtype)
        self.beta = Parameter('beta',
                              grad_req='write' if center else 'null',
                              shape=(in_channels,),
                              init=beta_initializer,
                              dtype=dtype)

    def forward(self, data):
        return data * self.gamma.data() + self.beta.data()

    def __repr__(self):
        s = '{name}({content}'
        in_channels = self.gamma.shape[0]
        s += ', in_channels={0}'.format(in_channels)
        s += ')'
        return s.format(name=self.__class__.__name__,
                        content=', '.join(
                            ['='.join([k, v.__repr__()])
                             for k, v in self._kwargs.items()]))
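# A quick numeric check of the formula out = gamma * data + beta (assumes
# MXNet 2.x with mx.npx.set_np() so mx.np arrays flow through Gluon blocks):
def example_nonorm():
    import mxnet as mx
    mx.npx.set_np()
    layer = NoNorm(in_channels=3)
    layer.initialize()
    layer.gamma.set_data(mx.np.array([2., 2., 2.]))
    layer.beta.set_data(mx.np.array([1., 0., -1.]))
    x = mx.np.array([[1., 2., 3.]])
    print(layer(x))  # [[3., 4., 5.]]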
def __init__(self, vocab_size: int, embed_size: int, units: int,
             cutoffs: Optional[Union[int, List]] = None,
             div_val: float = 1.0,
             dtype='float32',
             scaled=True,
             embedding_initializer: InitializerType = None,
             weight_initializer: InitializerType = None):
    """
    Parameters
    ----------
    vocab_size
        The size of the vocabulary
    embed_size
        The base size of the embedding vectors. The embedding size of each
        cluster will be
        [embed_size / div_val**0, embed_size / div_val**1, embed_size / div_val**2, ...]
    units
        The number of units after the mapping
    cutoffs
        The cutoffs to slice the vocab into multiple clusters. It should be
        a sorted list. Each value should be between 1 and vocab_size - 1.
    div_val
        The base denominator for computing the size of the embedding vector
        in each cluster.
    dtype
        The data type of the layer
    scaled
        Whether to scale the embedding by sqrt(units)
    embedding_initializer
        Initializer of the embedding vectors
    weight_initializer
        Initializer of the projection layers
    """
    super().__init__()
    cutoffs = _fmt_and_check_cutoffs(cutoffs, vocab_size)
    if cutoffs is None:
        assert div_val == 1.0
    self._dtype = dtype
    self._kwargs = OrderedDict([('cutoffs', cutoffs),
                                ('vocab_size', vocab_size),
                                ('embed_size', embed_size),
                                ('units', units),
                                ('div_val', div_val),
                                ('dtype', dtype),
                                ('scaled', scaled)])
    self._vocab_size = vocab_size
    self._cutoffs = cutoffs
    self._units = units
    self._embed_size = embed_size
    self._div_val = div_val
    self._scaled = scaled
    if self._scaled:
        self._emb_scale = units**0.5
    if div_val == 1.0:
        # single cluster: one embedding table, plus an optional projection
        # when the embedding size differs from the output units
        self.embed0_weight = Parameter('embed0_weight',
                                       shape=(vocab_size, embed_size),
                                       init=embedding_initializer,
                                       allow_deferred_init=True)
        if units != embed_size:
            self.inter_proj0_weight = Parameter('inter_proj0_weight',
                                                shape=(embed_size, units),
                                                init=weight_initializer,
                                                allow_deferred_init=True)
        else:
            self.proj_layers = None
    else:
        # multiple clusters: per-cluster embedding tables whose width shrinks
        # by a factor of div_val per cluster, each with its own projection
        self.proj_layers = nn.HybridSequential()
        for i, (l_idx, r_idx) in enumerate(
                zip([0] + cutoffs, cutoffs + [vocab_size])):
            inner_embed_size = int(embed_size / div_val**i)
            if inner_embed_size == 0:
                raise ValueError(
                    'div_val = {} is too large for the layer. Currently, the '
                    'cutoffs are {} and the embed_size is {}. Using '
                    'div_val = {} will cause some clusters to have '
                    'embed_size=0.'.format(div_val, cutoffs, embed_size,
                                           div_val))
            setattr(self, 'embed{}_weight'.format(i),
                    Parameter('embed{}_weight'.format(i),
                              shape=(r_idx - l_idx, inner_embed_size),
                              init=embedding_initializer,
                              allow_deferred_init=True))
            setattr(self, 'inter_proj{}_weight'.format(i),
                    Parameter('inter_proj{}_weight'.format(i),
                              shape=(inner_embed_size, units),
                              init=weight_initializer,
                              allow_deferred_init=True))
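# Cluster-size arithmetic from the docstring, made concrete (illustrative
# numbers only): with vocab_size=30000, cutoffs=[10000, 20000], embed_size=512
# and div_val=2.0, the clusters cover token-id ranges [0, 10000),
# [10000, 20000) and [20000, 30000) with embedding widths 512, 256 and 128;
# each width is then mapped up to `units` by its inter_proj{i}_weight.
def example_cluster_sizes(vocab_size=30000, cutoffs=(10000, 20000),
                          embed_size=512, div_val=2.0):
    cutoffs = list(cutoffs)
    for i, (l_idx, r_idx) in enumerate(zip([0] + cutoffs,
                                           cutoffs + [vocab_size])):
        print(i, (l_idx, r_idx), int(embed_size / div_val ** i))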
def __init__(self, d_model, epsilon, dtype):
    super().__init__()
    self.gamma = Parameter('layernorm_weight',
                           shape=(d_model,),
                           init='ones',
                           dtype=dtype)
    self.variance_epsilon = epsilon
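# The forward pass for this norm is not shown above. Below is a minimal,
# self-contained sketch of the T5-style RMS normalization these parameters
# suggest; it is an assumption for illustration, not the source's own forward.
# Requires mx.npx.set_np() in MXNet 2.x.
import mxnet as mx
from mxnet.gluon import HybridBlock, Parameter


class RMSNormSketch(HybridBlock):
    """Scale-only norm: no mean subtraction and no bias, unlike LayerNorm."""

    def __init__(self, d_model, epsilon=1e-6, dtype='float32'):
        super().__init__()
        self.gamma = Parameter('layernorm_weight', shape=(d_model,),
                               init='ones', dtype=dtype)
        self.variance_epsilon = epsilon

    def forward(self, data):
        # divide by the root mean square over the feature axis, then rescale
        variance = mx.np.mean(data ** 2, axis=-1, keepdims=True)
        return self.gamma.data() * data / mx.np.sqrt(variance + self.variance_epsilon)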
class TransformerXLDecoder(HybridBlock):
    def __init__(self, num_layers=3,
                 units=512,
                 hidden_size=2048,
                 num_heads=8,
                 activation_dropout=0.1,
                 dropout=0.1,
                 attention_dropout=0.0,
                 layernorm_eps=1E-12,
                 activation='relu',
                 dtype='float32',
                 layout='NT',
                 pre_norm=False,
                 weight_initializer=None,
                 bias_initializer=None):
        super().__init__()
        # content and positional attention biases shared across all layers
        self.query_k_bias = Parameter('query_k_bias',
                                      shape=(num_heads, units // num_heads),
                                      init=bias_initializer,
                                      allow_deferred_init=True)
        self.query_r_bias = Parameter('query_r_bias',
                                      shape=(num_heads, units // num_heads),
                                      init=bias_initializer,
                                      allow_deferred_init=True)
        self.decoder_layers = HybridSequential()
        for i in range(num_layers):
            self.decoder_layers.add(
                TransformerXLDecoderLayer(units=units,
                                          hidden_size=hidden_size,
                                          num_heads=num_heads,
                                          activation_dropout=activation_dropout,
                                          dropout=dropout,
                                          attention_dropout=attention_dropout,
                                          layer_norm_eps=layernorm_eps,
                                          activation=activation,
                                          dtype=dtype,
                                          layout=layout,
                                          pre_norm=pre_norm,
                                          weight_initializer=weight_initializer,
                                          bias_initializer=bias_initializer))

    def forward(self, data, mem_l, rel_positions, mask):
        """
        Parameters
        ----------
        data
            - layout = 'NT'
                Shape (batch_size, query_length, C_i)
            - layout = 'TN'
                Shape (query_length, batch_size, C_i)
        mem_l
            A list of memory tensors, one per layer. Each has

            - layout = 'NT'
                Shape (batch_size, mem_length, C_i)
            - layout = 'TN'
                Shape (mem_length, batch_size, C_i)
        rel_positions
            The relative positions.
            Shape (query_length, mem_length + query_length)
        mask
            Mask between the query and the memory + query.
            Shape (batch_size, query_length, mem_length + query_length)

        Returns
        -------
        out_l
            A list of hidden states, one per layer. Each has

            - layout = 'NT'
                Shape (batch_size, query_length, C_o)
            - layout = 'TN'
                Shape (query_length, batch_size, C_o)
        """
        query_k_bias = self.query_k_bias.data()
        query_r_bias = self.query_r_bias.data()
        out_l = []
        out = data
        for i, layer in enumerate(self.decoder_layers):
            out = layer(out, mem_l[i], rel_positions, mask,
                        query_r_bias, query_k_bias)
            out_l.append(out)
        return out_l
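# A shape-level usage sketch; all sizes, the all-ones mask, and the
# rel_positions construction are illustrative assumptions (the exact relative
# position convention lives in TransformerXLDecoderLayer, not shown here).
# Assumes GluonNLP's TransformerXLDecoderLayer is importable and
# mx.npx.set_np() has been called.
def example_transformer_xl_decoder():
    import mxnet as mx
    batch_size, query_length, mem_length = 2, 8, 16
    units, num_layers = 512, 3
    dec = TransformerXLDecoder(num_layers=num_layers, units=units)
    dec.initialize()
    data = mx.np.random.normal(size=(batch_size, query_length, units))
    mem_l = [mx.np.random.normal(size=(batch_size, mem_length, units))
             for _ in range(num_layers)]
    # queries sit at absolute positions [mem_length, mem_length + query_length)
    q_pos = mx.np.arange(mem_length, mem_length + query_length).reshape((-1, 1))
    k_pos = mx.np.arange(0, mem_length + query_length).reshape((1, -1))
    rel_positions = q_pos - k_pos  # (query_length, mem_length + query_length)
    mask = mx.np.ones((batch_size, query_length, mem_length + query_length))
    out_l = dec(data, mem_l, rel_positions, mask)
    print([out.shape for out in out_l])  # num_layers entries of (2, 8, 512)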