def optimize(args):
    """Gatys et al. CVPR 2016
    ref: Image Style Transfer Using Convolutional Neural Networks
    """
    if args.cuda:
        ctx = mx.gpu(0)
    else:
        ctx = mx.cpu(0)
    # load the content and style target
    content_image = utils.tensor_load_rgbimage(args.content_image, ctx,
                                               size=args.content_size, keep_asp=True)
    content_image = utils.subtract_imagenet_mean_preprocess_batch(content_image)
    style_image = utils.tensor_load_rgbimage(args.style_image, ctx, size=args.style_size)
    style_image = utils.subtract_imagenet_mean_preprocess_batch(style_image)
    # load the pre-trained vgg-16 and extract features
    vgg = net.Vgg16()
    utils.init_vgg_params(vgg, 'models', ctx=ctx)
    # content feature
    f_xc_c = vgg(content_image)[1]
    # style feature
    features_style = vgg(style_image)
    gram_style = [net.gram_matrix(y) for y in features_style]
    # output
    output = Parameter('output', shape=content_image.shape)
    output.initialize(ctx=ctx)
    output.set_data(content_image)
    # optimizer
    trainer = gluon.Trainer([output], 'adam', {'learning_rate': args.lr})
    mse_loss = gluon.loss.L2Loss()
    # optimizing the images
    for e in range(args.iters):
        utils.imagenet_clamp_batch(output.data(), 0, 255)
        # fix BN for pre-trained vgg
        with autograd.record():
            features_y = vgg(output.data())
            content_loss = 2 * args.content_weight * mse_loss(features_y[1], f_xc_c)
            style_loss = 0.
            for m in range(len(features_y)):
                gram_y = net.gram_matrix(features_y[m])
                gram_s = gram_style[m]
                style_loss = style_loss + 2 * args.style_weight * mse_loss(gram_y, gram_s)
            total_loss = content_loss + style_loss
            total_loss.backward()
        trainer.step(1)
        if (e + 1) % args.log_interval == 0:
            print('loss:{:.2f}'.format(total_loss.asnumpy()[0]))
    # save the image
    output = utils.add_imagenet_mean_batch(output.data())
    utils.tensor_save_bgrimage(output[0], args.output_image, args.cuda)
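# Minimal invocation sketch (not part of the original source): the attribute
# names mirror those read by optimize() above, while the file paths, loss
# weights, and iteration counts are hypothetical placeholders.
if __name__ == '__main__':
    from argparse import Namespace
    example_args = Namespace(
        cuda=False,
        content_image='images/content.jpg', content_size=512,
        style_image='images/style.jpg', style_size=512,
        content_weight=1.0, style_weight=5.0,
        lr=1e1, iters=500, log_interval=50,
        output_image='images/output.jpg')
    optimize(example_args)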
class LearnedPositionalEmbedding(HybridBlock):
    def __init__(self, units, max_length, mode='clip', dtype='float32',
                 weight_initializer=None):
        super().__init__()
        self._units = units
        self._dtype = dtype
        self._max_length = max_length
        self._mode = mode
        self.weight = Parameter('weight', shape=(max_length, units),
                                init=weight_initializer, dtype=dtype,
                                allow_deferred_init=True)

    def __repr__(self):
        s = '{name}(units={units}, max_length={max_length}, mode={mode}, dtype={dtype})'
        return s.format(name=self.__class__.__name__,
                        units=self._units,
                        max_length=self._max_length,
                        mode=self._mode,
                        dtype=self._dtype)

    def forward(self, positions):
        return np.take(self.weight.data(), positions, axis=0, mode=self._mode)
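# Minimal usage sketch (not part of the original source); the sizes below are
# hypothetical. Each integer position is mapped to a learned embedding row.
import mxnet as mx
mx.npx.set_np()   # the MXNet numpy interface is assumed to be active for this codebase

pos_embed = LearnedPositionalEmbedding(units=16, max_length=128)
pos_embed.initialize(ctx=mx.cpu())
positions = mx.np.arange(8, dtype='int32')   # positions 0..7
pos_vectors = pos_embed(positions)           # shape (8, 16)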
class RMSNorm(HybridBlock):
    """Apply root mean square layer normalization to the n-dimensional input array,
    i.e. layer normalization in which the mean is not subtracted in the numerator.

    For more details, see the paper: https://arxiv.org/pdf/1910.07467.pdf
    """
    def __init__(self, in_channels, center=True, scale=True,
                 beta_initializer='zeros', gamma_initializer='ones',
                 variance_epsilon=1E-6, dtype='float32', **kwargs):
        super().__init__()
        self._kwargs = {'center': center, 'scale': scale}
        self._in_channels = in_channels
        self._epsilon = variance_epsilon
        self.gamma = Parameter('gamma', grad_req='write' if scale else 'null',
                               shape=(in_channels,), init=gamma_initializer, dtype=dtype)
        self.beta = Parameter('beta', grad_req='write' if center else 'null',
                              shape=(in_channels,), init=beta_initializer, dtype=dtype)

    def forward(self, data):
        var = np.power(data, 2).mean(-1, keepdims=True)
        data = data * np.reciprocal(np.sqrt(var + self._epsilon))
        return data * self.gamma.data() + self.beta.data()

    def __repr__(self):
        s = '{name}({content}'
        in_channels = self.gamma.shape[0]
        s += ', in_channels={0}'.format(in_channels)
        s += ')'
        return s.format(name=self.__class__.__name__,
                        content=', '.join(
                            ['='.join([k, v.__repr__()]) for k, v in self._kwargs.items()]))
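# Minimal usage sketch (not part of the original source); the batch and channel
# sizes are hypothetical. The input is normalized over the last axis by its
# root mean square, then scaled and shifted.
import mxnet as mx
mx.npx.set_np()   # the MXNet numpy interface is assumed to be active for this codebase

rms_norm = RMSNorm(in_channels=32)
rms_norm.initialize(ctx=mx.cpu())
x = mx.np.random.normal(size=(2, 10, 32))   # (batch, seq_len, channels)
y = rms_norm(x)                             # same shape as x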
def check_gradient(forward_fn, fn_params: List[mx.ndarray.NDArray], wrt: Parameter,
                   seed=None, eps=3e-4, tol=1e-2) -> bool:
    """Check autograd backward for a given function using finite differencing.

    :param forward_fn: The function to test the gradients of. This function should return
        a scalar.
    :param fn_params: A list of parameters to call the function.
    :param wrt: The parameter with respect to which we take the gradient.
    :param seed: Random seed for mxnet and numpy. Note that the forward function might be
        stochastic. We reinitialize the seed to the same number before every forward
        function call.
    :param eps: Epsilon used in finite differencing. The default value is taken from
        theano's verify_grad function.
    :param tol: Absolute and relative tolerance used to check equality. Again, the default
        value is taken from theano's verify_grad function.
    :return: True if check succeeds.
    """
    if seed is None:
        seed = int(np.random.rand() * 1e6)

    # calculate gradient with autograd
    mx.random.seed(seed)
    np.random.seed(seed)
    with autograd.record():
        out = forward_fn(*fn_params)
    autograd.backward(out)
    ag_grad = wrt.grad().asnumpy()

    # calculate gradient by finite difference
    orig_data = wrt.data().asnumpy()
    fd_grad = np.zeros_like(orig_data)
    for i in range(orig_data.size):
        ix = np.unravel_index(i, orig_data.shape)

        # f(x + h)
        orig_data[ix] += eps
        wrt.set_data(orig_data)
        mx.random.seed(seed)
        np.random.seed(seed)
        out_ph = forward_fn(*fn_params).asscalar()

        # f(x - h)
        orig_data[ix] -= (2 * eps)
        wrt.set_data(orig_data)
        mx.random.seed(seed)
        np.random.seed(seed)
        out_mh = forward_fn(*fn_params).asscalar()

        orig_data[ix] += eps  # revert

        # calc gradient
        fd_grad[ix] = (out_ph - out_mh) / (2 * eps)

    # restore the original parameter values after the final perturbation
    wrt.set_data(orig_data)

    return np.allclose(ag_grad, fd_grad, atol=tol, rtol=tol)
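# Minimal usage sketch (not part of the original source); the parameter shape
# and forward function below are hypothetical.
import mxnet as mx
from mxnet.gluon import Parameter

w = Parameter('w', shape=(3,))
w.initialize(ctx=mx.cpu())
x = mx.nd.array([1.0, 2.0, 3.0])

def dot_with_w(inp):
    # scalar-valued function of the parameter under test
    return (w.data() * inp).sum()

assert check_gradient(dot_with_w, [x], wrt=w)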
def print_top_words(weight: gluon.Parameter, id2word: dict, top: int = 10) -> None:
    n_factors, vocab_size = weight.shape
    weight = weight.data().asnumpy()
    for factor_idx in range(n_factors):
        top_word_indices = np.argsort(weight[factor_idx])[::-1][0:top]
        logger.info('----------')
        logger.info('factor %d:' % factor_idx)
        for word_idx in top_word_indices:
            logger.info('%.3e\t%s' % (weight[factor_idx, word_idx], id2word[word_idx]))
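# Minimal usage sketch (not part of the original source); the vocabulary and
# factor matrix are hypothetical, and `logger` is assumed to be configured at
# module level as in the surrounding code.
import mxnet as mx
from mxnet import gluon

demo_id2word = {0: 'the', 1: 'cat', 2: 'sat', 3: 'on', 4: 'mat'}
factor_weight = gluon.Parameter('factor_weight', shape=(2, len(demo_id2word)))
factor_weight.initialize(ctx=mx.cpu())
print_top_words(factor_weight, demo_id2word, top=3)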
def print_nearest_cosine_distance(embeddings: gluon.Parameter, id2word: dict,
                                  num: int = 10) -> None:
    embeddings = embeddings.data().asnumpy().T
    top_wordids = list(id2word.keys())[0:num]
    distances = sklearn.metrics.pairwise.cosine_similarity(
        embeddings[top_wordids], embeddings)
    for idx, distance in zip(top_wordids, distances):
        top_word_indices = np.argsort(distance)[::-1][1:11]
        logger.info('----------')
        logger.info("nearest words in cosine distance to: %s" % id2word[idx])
        for nearest in top_word_indices:
            logger.info('%.3e\t%s' % (distance[nearest], id2word[nearest]))
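# Minimal usage sketch (not part of the original source), reusing the
# hypothetical demo_id2word from the sketch above. The embedding matrix is
# stored as (embedding_dim, vocab_size), matching the transpose taken inside
# the function.
word_embeddings = gluon.Parameter('word_embeddings', shape=(16, len(demo_id2word)))
word_embeddings.initialize(ctx=mx.cpu())
print_nearest_cosine_distance(word_embeddings, demo_id2word, num=3)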
class BucketPositionalEmbedding(HybridBlock):
    """Divide the positional space into buckets and assign the relative positions within
    each bucket to the same value. Positions that fall outside the boundary are all
    mapped to the same bucket.

    This is used in the T5 paper:
    "[Arxiv2019] Exploring the limits of transfer learning with a unified
    text-to-text transformer".

    Here, the first half of the buckets handles small shifts and the second half handles
    large shifts (mapping them into logarithmically separated bins).
    """
    def __init__(self, units, bidirectional=True, num_buckets=32, max_distance=128,
                 dtype='float32', embed_initializer=None):
        super().__init__()
        self._units = units
        self._bidirectional = bidirectional
        self._num_buckets = num_buckets
        self._max_distance = max_distance
        self._dtype = dtype
        self.weight = Parameter('weight', shape=(num_buckets, units),
                                init=embed_initializer, dtype=dtype,
                                allow_deferred_init=True)

    def __repr__(self):
        s = '{name}(units={units}, bidirectional={bidirectional}, num_buckets={num_buckets},' \
            ' max_distance={max_distance}, dtype={dtype})'
        return s.format(name=self.__class__.__name__,
                        units=self._units,
                        bidirectional=self._bidirectional,
                        num_buckets=self._num_buckets,
                        max_distance=self._max_distance,
                        dtype=self._dtype)

    def forward(self, relative_positions):
        buckets = relative_position_bucket(relative_positions,
                                           bidirectional=self._bidirectional,
                                           num_buckets=self._num_buckets,
                                           max_distance=self._max_distance)
        return np.take(self.weight.data(), buckets, axis=0)
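# Minimal usage sketch (not part of the original source); the relative-position
# matrix below is hypothetical and relies on the relative_position_bucket helper
# referenced by the class.
import mxnet as mx
mx.npx.set_np()   # the MXNet numpy interface is assumed to be active for this codebase

bucket_embed = BucketPositionalEmbedding(units=8, num_buckets=32, max_distance=128)
bucket_embed.initialize(ctx=mx.cpu())
# relative positions between a 4-token query and a 6-token memory+query sequence
rel_positions = mx.np.arange(6, dtype='int32').reshape(1, 6) \
    - mx.np.arange(4, dtype='int32').reshape(4, 1)
out = bucket_embed(rel_positions)   # shape (4, 6, 8)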
class NoNorm(HybridBlock):
    r"""Apply an element-wise linear transformation to the n-dimensional input array,
    replacing layer normalization.

    .. math::
        out = \gamma \circ data + \beta

    Parameters
    ----------
    in_channels : int
        Number of channels (feature maps) in input data. If not specified,
        initialization will be deferred to the first time `forward` is called.
    center: bool, default True
        If True, add offset of `beta` to normalized tensor.
        If False, `beta` is ignored.
    scale: bool, default True
        If True, multiply by `gamma`. If False, `gamma` is not used.
    beta_initializer: str or `Initializer`, default 'zeros'
        Initializer for the beta weight.
    gamma_initializer: str or `Initializer`, default 'ones'
        Initializer for the gamma weight.

    Inputs:
        - **data**: input tensor with arbitrary shape.

    Outputs:
        - **out**: output tensor with the same shape as `data`.

    References
    ----------
        `MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices
        <https://arxiv.org/pdf/2004.02984.pdf>`_

    Examples
    --------
    >>> # Input of shape (2, 5)
    >>> x = mx.np.array([[1, 2, 3, 4, 5], [1, 1, 2, 2, 2]])
    >>> # The element-wise transformation is calculated with the above formula
    >>> layer = NoNorm(in_channels=5)
    >>> layer.initialize(ctx=mx.cpu(0))
    >>> layer(x)
    array([[1., 2., 3., 4., 5.],
           [1., 1., 2., 2., 2.]])
    """
    def __init__(self, in_channels, center=True, scale=True,
                 beta_initializer='zeros', gamma_initializer='ones', **kwargs):
        super().__init__(**kwargs)
        self._kwargs = {'center': center, 'scale': scale}
        self._in_channels = in_channels
        self.gamma = Parameter('gamma', grad_req='write' if scale else 'null',
                               shape=(in_channels,), init=gamma_initializer)
        self.beta = Parameter('beta', grad_req='write' if center else 'null',
                              shape=(in_channels,), init=beta_initializer)

    def forward(self, data):
        return data * self.gamma.data() + self.beta.data()

    def __repr__(self):
        s = '{name}({content}'
        in_channels = self.gamma.shape[0]
        s += ', in_channels={0}'.format(in_channels)
        s += ')'
        return s.format(name=self.__class__.__name__,
                        content=', '.join(
                            ['='.join([k, v.__repr__()]) for k, v in self._kwargs.items()]))
class TransformerXLDecoder(HybridBlock):
    def __init__(self, num_layers=3,
                 units=512,
                 hidden_size=2048,
                 num_heads=8,
                 activation_dropout=0.1,
                 dropout=0.1,
                 attention_dropout=0.0,
                 layernorm_eps=1E-12,
                 activation='relu',
                 dtype='float32',
                 layout='NT',
                 pre_norm=False,
                 weight_initializer=None,
                 bias_initializer=None):
        super().__init__()
        self.query_k_bias = Parameter('query_k_bias',
                                      shape=(num_heads, units // num_heads),
                                      init=bias_initializer,
                                      allow_deferred_init=True)
        self.query_r_bias = Parameter('query_r_bias',
                                      shape=(num_heads, units // num_heads),
                                      init=bias_initializer,
                                      allow_deferred_init=True)
        self.decoder_layers = nn.HybridSequential()
        for i in range(num_layers):
            self.decoder_layers.add(
                TransformerXLDecoderLayer(units=units,
                                          hidden_size=hidden_size,
                                          num_heads=num_heads,
                                          activation_dropout=activation_dropout,
                                          dropout=dropout,
                                          attention_dropout=attention_dropout,
                                          layer_norm_eps=layernorm_eps,
                                          activation=activation,
                                          dtype=dtype,
                                          layout=layout,
                                          pre_norm=pre_norm,
                                          weight_initializer=weight_initializer,
                                          bias_initializer=bias_initializer))

    def forward(self, data, mem_l, rel_positions, mask):
        """
        Parameters
        ----------
        data
            - layout = 'NT': Shape (batch_size, query_length, C_i)
            - layout = 'TN': Shape (query_length, batch_size, C_i)
        mem_l
            A list of memories, one per layer, each with shape:
            - layout = 'NT': Shape (batch_size, mem_length, C_i)
            - layout = 'TN': Shape (mem_length, batch_size, C_i)
        rel_positions
            The relative positions.
            Shape (query_length, mem_length + query_length)
        mask
            Mask between the query and the memory + query.
            Shape (batch_size, query_length, mem_length + query_length)

        Returns
        -------
        out_l
            A list of hidden states, one per layer, each with shape:
            - layout = 'NT': Shape (batch_size, query_length, C_o)
            - layout = 'TN': Shape (query_length, batch_size, C_o)
        """
        query_k_bias = self.query_k_bias.data()
        query_r_bias = self.query_r_bias.data()
        out_l = []
        out = data
        for i, layer in enumerate(self.decoder_layers):
            out = layer(out, mem_l[i], rel_positions, mask, query_r_bias, query_k_bias)
            out_l.append(out)
        return out_l