def celu_backward(inputs, alpha=1.0, axis=1):
    """
    Args:
      inputs (list of nn.Variable): Incoming grads/inputs to/of the forward function.
      kwargs (dict of arguments): Dictionary of the corresponding function arguments.

    Return:
      list of Variable: Return the gradients wrt inputs of the corresponding function.
    """
    dy = inputs[0]
    x0 = inputs[1]
    fstart, fstop, fstep = create_slice(dy.shape, axis, True)
    bstart, bstop, bstep = create_slice(dy.shape, axis, False)
    dy0 = F.slice(dy, fstart, fstop, fstep)
    dy1 = F.slice(dy, bstart, bstop, bstep)
    aep = alpha * F.exp(x0)
    aen = alpha * F.exp(-x0)
    m0 = F.greater_scalar(x0, 0)
    m1 = 1 - m0
    m0 = no_grad(m0)
    m1 = no_grad(m1)
    dx00 = dy0 * (m0 + aep * m1)
    dx01 = dy1 * (m1 + aen * m0)
    dx = dx00 - dx01
    return dx
def backward_impl(self, inputs, outputs, prop_down, accum):
    # inputs: [inputs_fwd_graph] + [inputs_bwd_graph] or
    # [inputs_fwd_graph] + [outputs_fwd_graph] + [inputs_bwd_graph]

    # Inputs
    x0 = inputs[0].data
    dy = inputs[1].data
    # Outputs
    dx0 = outputs[0].data
    # Grads of inputs
    g_x0 = inputs[0].grad
    g_dy = inputs[1].grad
    # Grads of outputs
    g_dx0 = outputs[0].grad

    # Computation
    if prop_down[0]:
        if accum[0]:
            g_x0 += g_dx0 * dx0
        else:
            g_x0.copy_from(g_dx0 * dx0)
    if prop_down[1]:
        if accum[1]:
            g_dy += g_dx0 * F.exp(x0)
        else:
            g_dy.copy_from(g_dx0 * F.exp(x0))
def sigmas_learned_coef(ctx, log_var0, log_var1):
    v0 = F.exp(log_var0)
    v1 = F.exp(log_var1)
    c0 = F.minimum_scalar(v0, 1.)
    c1 = F.minimum_scalar(v1, 1.)
    c = c1 / c0
    return c
def sigmas_regularization(ctx, log_var0, log_var1):
    with nn.context_scope(ctx):
        h0 = F.exp(log_var0)
        h0 = F.pow_scalar(h0, 0.5)
        h1 = F.exp(log_var1)
        h1 = F.pow_scalar(h1, 0.5)
        r = F.mean(F.squared_error(h0, h1))
    return r
def sr_loss_with_uncertainty(ctx, pred0, pred1, log_var0, log_var1):
    # TODO: squared error/absolute error
    s0 = F.exp(log_var0)
    s1 = F.exp(log_var1)
    squared_error = F.squared_error(pred0, pred1)
    with nn.context_scope(ctx):
        loss_sr = F.mean(squared_error * (1 / s0 + 1 / s1)
                         + (s0 / s1 + s1 / s0)) * 0.5
    return loss_sr
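# Usage sketch (not part of the original source): builds the uncertainty-weighted
# consistency loss above on dummy variables. The CPU context and the (16, 10)
# shapes are illustrative assumptions.
import nnabla as nn
from nnabla.ext_utils import get_extension_context

ctx = get_extension_context('cpu')
pred0, pred1 = nn.Variable((16, 10)), nn.Variable((16, 10))
log_var0, log_var1 = nn.Variable((16, 10)), nn.Variable((16, 10))
loss = sr_loss_with_uncertainty(ctx, pred0, pred1, log_var0, log_var1)
print(loss.shape)  # () -- a scalar loss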
def q_function(obs, num_actions, min_v, max_v, num_bins, scope):
    with nn.parameter_scope(scope):
        out = nature_head(obs)
        out = PF.affine(out, num_actions * num_bins, name='output')
        out = F.reshape(out, (-1, num_actions, num_bins))
        probs = F.exp(out) / F.sum(F.exp(out), axis=2, keepdims=True)
        dists = F.arange(0, num_bins) * (max_v - min_v) / (num_bins - 1) + min_v
        values = F.sum(probs * F.reshape(dists, (1, 1, num_bins)), axis=2)
        return values, probs, F.reshape(dists, (-1, 1))
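# Usage sketch (not part of the original source): builds the categorical
# value-distribution head above. The Atari-style observation shape and the
# `nature_head` feature extractor from the surrounding code are assumptions.
import nnabla as nn

obs = nn.Variable((32, 4, 84, 84))
values, probs, dists = q_function(obs, num_actions=6, min_v=-10., max_v=10.,
                                  num_bins=51, scope='q_func')
print(values.shape, probs.shape, dists.shape)  # (32, 6) (32, 6, 51) (51, 1)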
def sr_loss_with_uncertainty(ctx, pred0, pred1, log_var0, log_var1):
    var0 = F.exp(log_var0)
    var1 = F.exp(log_var1)
    s0 = F.pow_scalar(var0, 0.5)
    s1 = F.pow_scalar(var1, 0.5)
    squared_error = F.squared_error(pred0, pred1)
    with nn.context_scope(ctx):
        loss = F.log(s1 / s0) + (var0 / var1 + squared_error / var1) * 0.5
        loss_sr = F.mean(loss)
    return loss_sr
def sr_loss_with_uncertainty(ctx, pred0, pred1, log_v0, log_v1, log_s0, log_s1):
    v0 = F.exp(log_v0)
    v1 = F.exp(log_v1)
    squared_error = F.squared_error(pred0, pred1)
    s0 = F.exp(log_s0)
    s1 = F.exp(log_s1)
    with nn.context_scope(ctx):
        error = squared_error * (1 / v0 + 1 / v1) \
            + (v0 / v1 + v1 / v0) + (s0 / s1 + s1 / s0)
        loss_sr = F.mean(error) * 0.5
    return loss_sr
def sigma_regularization(ctx, log_var, one):
    with nn.context_scope(ctx):
        h = F.exp(log_var)
        h = F.pow_scalar(h, 0.5)
        h = F.mean(h, axis=1)
        r = F.mean(F.squared_error(h, one))
    return r
def sigmas_coef(ctx, log_var0, log_var1):
    v0 = F.exp(log_var0)
    v1 = F.exp(log_var1)
    v0_g = F.greater_scalar(v0, 1.)
    v0_l = F.logical_not(v0_g)
    v1_g = F.greater_scalar(v1, 1.)
    v1_l = F.logical_not(v1_g)
    v0_g_and_v1_g = F.logical_and(v0_g, v1_g)
    v0_g_and_v1_l = F.logical_and(v0_g, v1_l)
    v0_l_and_v1_g = F.logical_and(v0_l, v1_g)
    v0_l_and_v1_l = F.logical_and(v0_l, v1_l)
    c = v0_g_and_v1_g \
        + v0_g_and_v1_l * v1 \
        + v0_l_and_v1_g / v0 \
        + v0_l_and_v1_l * v1 / v0
    return c
def sr_loss_with_uncertainty(ctx, pred0, pred1, log_var):
    # TODO: squared error/absolute error
    with nn.context_scope(ctx):
        loss_sr = F.mean(F.squared_error(F.softmax(pred0), F.softmax(pred1))
                         * F.exp(-log_var)) \
            + F.mean(log_var)
    return loss_sr
def sigma_regularization(ctx, log_var, one):
    with nn.context_scope(ctx):
        h = F.exp(log_var)
        h = F.pow_scalar(h, 0.5)
        b = log_var.shape[0]
        r = F.sum(F.squared_error(h, one)) / b
    return r
def yolov2_activate(x, anchors, biases):
    shape = x.shape
    y = F.reshape(x, (shape[0], anchors, -1,) + shape[2:])
    stop = list(y.shape)
    stop[2] = 2
    t_xy = F.slice(y, (0, 0, 0, 0, 0), stop)
    stop[2] = 4
    t_wh = F.slice(y, (0, 0, 2, 0, 0), stop)
    stop[2] = 5
    t_o = F.slice(y, (0, 0, 4, 0, 0), stop)
    stop[2] = y.shape[2]
    t_p = F.slice(y, (0, 0, 5, 0, 0), stop)
    t_xy = F.sigmoid(t_xy)
    t_wh = F.exp(t_wh)
    t_o = F.sigmoid(t_o)
    t_p = F.softmax(t_p, axis=2)
    t_x, t_y, t_wh = yolov2_image_coordinate(t_xy, t_wh, biases)
    y = F.concatenate(t_x, t_y, t_wh, t_o, t_p, axis=2)
    # Integer division so the reshape size stays an int.
    y = F.transpose(y, (0, 1, 3, 4, 2)).reshape(
        (shape[0], -1, shape[1] // anchors))
    return y
def ce_loss_with_uncertainty(ctx, pred, y_l, log_var):
    r = F.randn(0., 1., log_var.shape)
    r = F.pow_scalar(F.exp(log_var), 0.5) * r
    h = pred + r
    with nn.context_scope(ctx):
        loss_ce = F.mean(F.softmax_cross_entropy(h, y_l))
    return loss_ce
def logits(image, text):
    image_features = encode_image(image)
    text_features = encode_text(text)

    # normalized features
    image_features = image_features / \
        F.norm(image_features, axis=1, keepdims=True)
    text_features = text_features / \
        F.norm(text_features, axis=1, keepdims=True)

    # cosine similarity as logits
    logit_scale = nn.parameter.get_parameter_or_create(
        name='logit_scale', shape=())
    logit_scale = F.exp(logit_scale)

    image_features = image_features.reshape(
        (1, image_features.shape[0], image_features.shape[1]))
    text_features = F.transpose(text_features, (1, 0))
    text_features = text_features.reshape(
        (1, text_features.shape[0], text_features.shape[1]))

    per_image = F.batch_matmul(image_features, text_features).reshape(
        (image_features.shape[0], -1))
    logits_per_image = logit_scale.reshape((1, 1)) * per_image
    logits_per_text = F.transpose(logits_per_image, (1, 0))

    # shape = [global_batch_size, global_batch_size]
    return logits_per_image, logits_per_text
def kl_divergence(ctx, pred, label, log_var):
    with nn.context_scope(ctx):
        s = F.pow_scalar(F.exp(log_var), 0.5)
        elms = softmax_with_temperature(ctx, label, s) \
            * F.log(F.softmax(pred, axis=1))
        loss = -F.mean(F.sum(elms, axis=1))
    return loss
def sr_loss_with_uncertainty_and_coef(ctx, pred0, pred1, log_var0, log_var1):
    c0 = srwu_learned_coef(ctx, log_var0)
    c1 = srwu_learned_coef(ctx, log_var1)
    sc0 = sigmas_learned_coef(ctx, log_var0, log_var1)
    sc1 = sigmas_learned_coef(ctx, log_var1, log_var0)
    c0.need_grad = False
    c1.need_grad = False
    sc0.need_grad = False
    sc1.need_grad = False
    # TODO: squared error/absolute error
    s0 = F.exp(log_var0)
    s1 = F.exp(log_var1)
    squared_error = F.squared_error(pred0, pred1)
    with nn.context_scope(ctx):
        loss_sr = F.mean(
            squared_error * (c0 / s0 + c1 / s1)
            + (sc0 * s0 / s1 + sc1 * s1 / s0)) * 0.5
    return loss_sr
def policy_network(obs, action_size, name):
    with nn.parameter_scope(name):
        out = PF.affine(obs, 256, name='fc1')
        out = F.relu(out)
        out = PF.affine(out, 256, name='fc2')
        out = F.relu(out)
        mean = PF.affine(out, action_size, name='mean')
        logstd = PF.affine(out, action_size, name='logstd')
        clipped_logstd = F.clip_by_value(logstd, -20, 2)
        return Normal(mean, F.exp(clipped_logstd))
def conv_block(self, x):
    out = PF.convolution(x, self.filter_size, (3, 3), pad=(1, 1))
    out = F.relu(out, inplace=True)
    out = PF.convolution(out, self.filter_size, (1, 1))
    out = F.relu(out, inplace=True)
    # Pad width is an assumption: pad H/W by 1 so the following 3x3,
    # pad=(0, 0) convolution keeps the spatial size.
    out = F.pad(out, (1, 1, 1, 1))
    out = PF.convolution(out, self.in_channel, (3, 3), pad=(0, 0))
    out = out * F.exp(self.scale * 3)
    return out
def kl_loss(self, mu, logvar):
    r"""Returns the Kullback-Leibler divergence loss with a standard Gaussian.

    Args:
        mu (nn.Variable): Mean of the distribution of shape (B, D, 1).
        logvar (nn.Variable): Log variance of the distribution of shape (B, D, 1).

    Returns:
        nn.Variable: Kullback-Leibler divergence loss.
    """
    return 0.5 * F.mean(F.sum(F.exp(logvar) + mu**2 - 1. - logvar, axis=1))
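# Sketch (not part of the original source): the same closed-form
# KL(N(mu, exp(logvar)) || N(0, I)) evaluated on concrete values. The
# (B, D, 1) = (4, 8, 1) shape is an illustrative assumption.
import numpy as np
import nnabla as nn
import nnabla.functions as F

mu = nn.Variable.from_numpy_array(np.zeros((4, 8, 1), dtype=np.float32))
logvar = nn.Variable.from_numpy_array(np.zeros((4, 8, 1), dtype=np.float32))
kl = 0.5 * F.mean(F.sum(F.exp(logvar) + mu**2 - 1. - logvar, axis=1))
kl.forward()
print(kl.d)  # 0.0: a zero-mean, unit-variance Gaussian matches the prior exactly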
def softplus_backward(inputs):
    """
    Args:
      inputs (list of nn.Variable): Incoming grads/inputs to/of the forward function.
      kwargs (dict of arguments): Dictionary of the corresponding function arguments.

    Return:
      list of Variable: Return the gradients wrt inputs of the corresponding function.
    """
    dy = inputs[0]
    x0 = inputs[1]
    e0 = F.exp(x0)
    dx0 = dy * e0 / (1 + e0)
    return dx0
def sample(self, mu, logvar):
    r"""Samples from a Gaussian distribution.

    Args:
        mu (nn.Variable): Mean of the distribution of shape (B, D, 1).
        logvar (nn.Variable): Log variance of the distribution of shape (B, D, 1).

    Returns:
        nn.Variable: A sample.
    """
    if self.training:
        eps = F.randn(shape=mu.shape)
        return mu + F.exp(0.5 * logvar) * eps
    return mu
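# Sketch (not part of the original source): the reparameterization trick used
# above, written out on concrete variables so the gradient path through mu and
# logvar stays visible. Shapes are illustrative assumptions.
import numpy as np
import nnabla as nn
import nnabla.functions as F

mu = nn.Variable.from_numpy_array(np.zeros((4, 8, 1), dtype=np.float32))
logvar = nn.Variable.from_numpy_array(np.zeros((4, 8, 1), dtype=np.float32))
eps = F.randn(shape=mu.shape)       # noise carries no learnable parameters
z = mu + F.exp(0.5 * logvar) * eps  # differentiable w.r.t. mu and logvar
z.forward()
print(z.shape)  # (4, 8, 1)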
def pointer_net(query_embed, query_embed_mask, decoder_states, hidden_dim):
    """
    query_embed: (batch_size, max_query_length, E1)
    decoder_states: (batch_size, max_action_length, E2)
    """
    with nn.parameter_scope("pointer_net"):
        batch_size, max_query_length, _ = query_embed.shape
        _, max_action_length, _ = decoder_states.shape
        with nn.parameter_scope("layer1_input"):
            query_embed_trans = dense(query_embed, hidden_dim,
                                      base_axis=2, activation=lambda x: x)
        with nn.parameter_scope("layer1_h"):
            h_trans = dense(decoder_states, hidden_dim,
                            base_axis=2, activation=lambda x: x)

        query_embed_trans = F.reshape(
            query_embed_trans,
            (batch_size, 1, max_query_length, hidden_dim))
        query_embed_trans = F.broadcast(
            query_embed_trans,
            (batch_size, max_action_length, max_query_length, hidden_dim))
        h_trans = F.reshape(
            h_trans, (batch_size, max_action_length, 1, hidden_dim))
        h_trans = F.broadcast(
            h_trans,
            (batch_size, max_action_length, max_query_length, hidden_dim))
        dense1_trans = F.tanh(query_embed_trans + h_trans)
        with nn.parameter_scope("layer2"):
            # scores: (batch_size, max_action_length, max_query_length, 1)
            scores = dense(dense1_trans, 1, base_axis=3, activation=lambda x: x)

        # scores: (batch_size, max_action_length, max_query_length)
        scores = F.reshape(
            scores, (batch_size, max_action_length, max_query_length))
        scores = F.exp(scores - F.max(scores, axis=2, keepdims=True))
        mask = F.reshape(query_embed_mask, (batch_size, 1, max_query_length))
        mask = F.broadcast(
            mask, (batch_size, max_action_length, max_query_length))
        scores = scores * mask
        scores = scores / F.sum(scores, axis=2, keepdims=True)
        return scores
def log_softmax_backward(inputs, axis=None):
    """
    Args:
      inputs (list of nn.Variable): Incoming grads/inputs to/of the forward function.
      kwargs (dict of arguments): Dictionary of the corresponding function arguments.

    Return:
      list of Variable: Return the gradients wrt inputs of the corresponding function.
    """
    dy = inputs[0]
    x0 = inputs[1]
    y0 = get_output(x0, "LogSoftmax")
    D = len(x0.shape)
    axis = positive_axis(axis, D)
    dx0 = dy - F.exp(y0) * F.sum(dy, axis=axis, keepdims=True)
    return dx0
def gaussian_log_likelihood(x, mean, logstd, orig_max_val=255):
    """
    Compute the log-likelihood of a Gaussian distribution for given data `x`.

    Args:
        x (nn.Variable): Target data. It is assumed that the values are in the
                         range [-1, 1], rescaled from the original [0, orig_max_val].
        mean (nn.Variable): Gaussian mean. Must be the same shape as x.
        logstd (nn.Variable): Gaussian log standard deviation. Must be the same shape as x.
        orig_max_val (int): The maximum value that x originally has before being rescaled.

    Return:
        Log probabilities of x in nats.
    """
    assert x.shape == mean.shape == logstd.shape
    centered_x = x - mean
    inv_std = F.exp(-logstd)
    half_bin = 1.0 / orig_max_val

    def clamp(val):
        # Clip to avoid log(0); the upper bound is effectively inactive for CDF values.
        return F.clip_by_value(val, min=1e-12, max=1e8)

    # x + 0.5 (in original scale)
    plus_in = inv_std * (centered_x + half_bin)
    cdf_plus = approx_standard_normal_cdf(plus_in)
    log_cdf_plus = F.log(clamp(cdf_plus))

    # x - 0.5 (in original scale)
    minus_in = inv_std * (centered_x - half_bin)
    cdf_minus = approx_standard_normal_cdf(minus_in)
    log_one_minus_cdf_minus = F.log(clamp(1.0 - cdf_minus))

    log_cdf_delta = F.log(clamp(cdf_plus - cdf_minus))

    log_probs = F.where(
        F.less_scalar(x, -0.999),
        log_cdf_plus,  # Edge case for 0. It uses cdf for -inf as cdf_minus.
        F.where(F.greater_scalar(x, 0.999),
                # Edge case for orig_max_val. It uses cdf for +inf as cdf_plus.
                log_one_minus_cdf_minus,
                log_cdf_delta  # otherwise
                )
    )

    assert log_probs.shape == x.shape
    return log_probs
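# Usage sketch (not part of the original source): per-pixel log-likelihood of
# dummy 8-bit image data rescaled to [-1, 1]. Shapes and the random data are
# illustrative; `approx_standard_normal_cdf` from the surrounding code is
# assumed to be in scope.
import numpy as np
import nnabla as nn

raw = np.random.randint(0, 256, size=(2, 3, 8, 8))
x = nn.Variable.from_numpy_array((raw / 127.5 - 1.).astype(np.float32))
mean = nn.Variable.from_numpy_array(np.zeros((2, 3, 8, 8), dtype=np.float32))
logstd = nn.Variable.from_numpy_array(np.zeros((2, 3, 8, 8), dtype=np.float32))
log_probs = gaussian_log_likelihood(x, mean, logstd, orig_max_val=255)
print(log_probs.shape)  # (2, 3, 8, 8): one log-probability per element, in nats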
def elu_backward(inputs, alpha=1.0):
    """
    Args:
      inputs (list of nn.Variable): Incoming grads/inputs to/of the forward function.
      kwargs (dict of arguments): Dictionary of the corresponding function arguments.

    Return:
      list of Variable: Return the gradients wrt inputs of the corresponding function.
    """
    dy = inputs[0]
    x0 = inputs[1]
    m0 = F.greater_scalar(x0, 0)
    m1 = 1 - m0
    m0 = no_grad(m0)
    m1 = no_grad(m1)
    dx = dy * (m0 + alpha * F.exp(x0) * m1)
    return dx
def sinusoidal_embedding(timesteps, embedding_dim):
    """
    Sinusoidal embeddings originally proposed in
    "Attention Is All You Need" (https://arxiv.org/abs/1706.03762).
    """
    assert len(timesteps.shape) == 1

    half_dim = embedding_dim // 2
    denominator = -np.log(10000) / half_dim
    emb = F.exp(denominator * F.arange(start=0, stop=half_dim))
    emb = F.reshape(timesteps, (-1, 1)) * F.reshape(emb, (1, -1))
    emb = F.concatenate(F.cos(emb), F.sin(emb), axis=1)

    if embedding_dim & 1:
        # zero pad to be divisible by two
        emb = F.pad(emb, [[0, 0], [0, 1]])

    assert emb.shape == (timesteps.shape[0], embedding_dim)

    return emb
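# Usage sketch (not part of the original source): embeds four integer timesteps
# into 128-dimensional vectors. The timestep values and embedding size are
# illustrative assumptions.
import numpy as np
import nnabla as nn

timesteps = nn.Variable.from_numpy_array(
    np.array([0, 1, 10, 100], dtype=np.float32))
emb = sinusoidal_embedding(timesteps, embedding_dim=128)
emb.forward()
print(emb.shape)  # (4, 128)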
def __call__(self, x, return_encoding_indices=False):
    x = F.transpose(x, (0, 2, 3, 1))
    x_flat = x.reshape((-1, self.embedding_dim))

    x_flat_squared = F.broadcast(
        F.sum(x_flat**2, axis=1, keepdims=True),
        (x_flat.shape[0], self.num_embedding))
    emb_wt_squared = F.transpose(
        F.sum(self.embedding_weight**2, axis=1, keepdims=True), (1, 0))

    distances = x_flat_squared + emb_wt_squared - 2 * \
        F.affine(x_flat, F.transpose(self.embedding_weight, (1, 0)))

    encoding_indices = F.min(
        distances, only_index=True, axis=1, keepdims=True)
    encoding_indices.need_grad = False

    quantized = F.embed(
        encoding_indices.reshape(encoding_indices.shape[:-1]),
        self.embedding_weight).reshape(x.shape)

    if return_encoding_indices:
        return encoding_indices, F.transpose(quantized, (0, 3, 1, 2))

    encodings = F.one_hot(encoding_indices, (self.num_embedding, ))

    e_latent_loss = F.mean(F.squared_error(
        quantized.get_unlinked_variable(need_grad=False), x))
    q_latent_loss = F.mean(F.squared_error(
        quantized, x.get_unlinked_variable(need_grad=False)))
    loss = q_latent_loss + self.commitment_cost * e_latent_loss

    quantized = x + (quantized - x).get_unlinked_variable(need_grad=False)

    avg_probs = F.mean(encodings, axis=0)
    perplexity = F.exp(-F.sum(avg_probs * F.log(avg_probs + 1.0e-10)))

    return loss, F.transpose(quantized, (0, 3, 1, 2)), perplexity, encodings
def kp2gaussian(kp, spatial_size, kp_variance):
    mean = kp['value']

    coordinate_grid = make_coordinate_grid(spatial_size)
    number_of_leading_dimensions = len(mean.shape) - 1
    shape = (1,) * number_of_leading_dimensions + coordinate_grid.shape
    coordinate_grid = F.reshape(coordinate_grid, shape)
    coordinate_grid = F.broadcast(
        coordinate_grid,
        mean.shape[:number_of_leading_dimensions]
        + coordinate_grid.shape[number_of_leading_dimensions:])

    # Preprocess kp shape
    shape = mean.shape[:number_of_leading_dimensions] + (1, 1, 2)
    mean = F.reshape(mean, shape, inplace=False)

    mean_sub = coordinate_grid - mean
    out = F.exp(-0.5 * F.sum((mean_sub**2),
                             axis=mean_sub.ndim - 1) / kp_variance)
    return out
def position_encoding(x: nn.Variable) -> nn.Variable:
    batch_size, sequence_length, dim = x.shape

    position = F.reshape(F.arange(0, sequence_length),
                         shape=(sequence_length, 1))
    # -> (sequence_length, 1)
    div_term = F.exp(F.arange(0, dim, 2) * -(np.log(10000.0) / dim))
    # -> (dim//2, )
    sin_val = F.sin(position * F.reshape(div_term, shape=(1, dim // 2)))
    # -> (sequence_length, dim//2)
    cos_val = F.cos(position * F.reshape(div_term, shape=(1, dim // 2)))
    # -> (sequence_length, dim//2)

    ret = []
    for i in range(dim):
        if i % 2 == 0:
            ret.append(sin_val[:, i // 2:i // 2 + 1])
        else:
            ret.append(cos_val[:, i // 2:i // 2 + 1])
    pe = F.reshape(F.concatenate(*ret, axis=1),
                   shape=(1, sequence_length, dim))

    return x + F.broadcast(pe, shape=x.shape)
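# Usage sketch (not part of the original source): adds sinusoidal position
# encodings to a dummy (batch, sequence, dim) input. The sizes are illustrative
# assumptions; `dim` is kept even, as the interleaving above expects.
import numpy as np
import nnabla as nn

x = nn.Variable.from_numpy_array(
    np.zeros((8, 50, 64), dtype=np.float32))  # (batch_size, sequence_length, dim)
y = position_encoding(x)
y.forward()
print(y.shape)  # (8, 50, 64)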
def sigma_regularization(ctx, log_var, one):
    with nn.context_scope(ctx):
        h = F.exp(log_var)
        h = F.pow_scalar(h, 0.5)
        r = F.mean(F.abs(h - one))
    return r
def srwu_coef(ctx, log_var):
    v = F.exp(log_var)
    v0_g = F.greater_scalar(v, 1.)
    v0_l = F.logical_not(v0_g)
    c = v0_g + v * v0_l
    return c
def srwu_learned_coef(ctx, log_var):
    v = F.exp(log_var)
    c = F.minimum_scalar(v, 1.)
    return c