def fuse_conv_and_bn(conv, bn):
    # Fuse convolution and batchnorm layers https://tehnokv.com/posts/fusing-batchnorm-and-conv/
    fusedconv = nn.Conv2d(conv.in_channels,
                          conv.out_channels,
                          kernel_size=conv.kernel_size,
                          stride=conv.stride,
                          padding=conv.padding,
                          groups=conv.groups,
                          bias=True)

    # prepare filters
    w_conv = conv.weight.clone().view(conv.out_channels, -1)
    w_bn = jt.diag(bn.weight / (jt.sqrt(bn.eps + bn.running_var)))
    fusedconv.weight.assign(jt.matmul(w_bn, w_conv).view(fusedconv.weight.shape))

    # prepare spatial bias
    b_conv = jt.zeros((conv.weight.shape[0], )) if conv.bias is None else conv.bias
    b_bn = bn.bias - bn.weight * bn.running_mean / jt.sqrt(bn.running_var + bn.eps)
    fusedconv.bias.assign(jt.matmul(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn)

    return fusedconv
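# Hedged sanity check for the fusion above (not part of the original code): build a small
# conv + batchnorm pair, fuse them, and compare outputs in eval mode. The layer sizes and
# the use of jt.nn.Conv2d / jt.nn.BatchNorm here are illustrative assumptions.
import jittor as jt
from jittor import nn

conv = nn.Conv2d(8, 16, 3, padding=1, bias=False)
bn = nn.BatchNorm(16)
bn.eval()  # the fused weights bake in the running statistics, so compare in eval mode

fused = fuse_conv_and_bn(conv, bn)
x = jt.random([2, 8, 32, 32])
max_diff = jt.abs(bn(conv(x)) - fused(x)).max()
assert max_diff.item() < 1e-4, max_diff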
def execute(self, x):
    if len(x.shape) == 3:
        dims = [0, 2]
    else:
        dims = [0]
    if self.is_train:
        xmean = jt.mean(x, dims=dims, keepdims=1)
        x2mean = jt.mean(x * x, dims=dims, keepdims=1)
        if self.sync and jt.in_mpi:
            xmean = xmean.mpi_all_reduce("mean")
            x2mean = x2mean.mpi_all_reduce("mean")

        xvar = x2mean - xmean * xmean
        norm_x = (x - xmean) / jt.sqrt(xvar + self.eps)
        self.running_mean.update(self.running_mean +
                                 (xmean.sum(dims) - self.running_mean) * self.momentum)
        self.running_var.update(self.running_var +
                                (xvar.sum(dims) - self.running_var) * self.momentum)
    else:
        running_mean = self.running_mean.broadcast(x, dims)
        running_var = self.running_var.broadcast(x, dims)
        norm_x = (x - running_mean) / jt.sqrt(running_var + self.eps)
    if not self.affine:
        return norm_x
    w = self.weight.broadcast(x, dims)
    b = self.bias.broadcast(x, dims)
    return norm_x * w + b
def execute(self, x):
    dims = [0] + list(range(2, x.ndim))
    ####### centering calibration begin #######
    x += self.center_weight * self.stas(x)
    ####### centering calibration end #######
    if self.is_train:
        xmean = jt.mean(x, dims=dims)
        x2mean = jt.mean(x * x, dims=dims)
        if self.sync and jt.in_mpi:
            xmean = xmean.mpi_all_reduce("mean")
            x2mean = x2mean.mpi_all_reduce("mean")

        xvar = (x2mean - xmean * xmean).maximum(0.0)
        w = 1.0 / jt.sqrt(xvar + self.eps)
        b = -xmean * w
        norm_x = x * w.broadcast(x, dims) + b.broadcast(x, dims)

        self.running_mean.update(self.running_mean +
                                 (xmean.reshape((-1, )) - self.running_mean) * self.momentum)
        self.running_var.update(self.running_var +
                                (xvar.reshape((-1, )) - self.running_var) * self.momentum)
    else:
        w = 1.0 / jt.sqrt(self.running_var + self.eps)
        b = -self.running_mean * w
        norm_x = x * w.broadcast(x, dims) + b.broadcast(x, dims)
    ####### scaling calibration begin #######
    scale_factor = jt.sigmoid(self.scale_weight * self.stas(norm_x) + self.scale_bias)
    ####### scaling calibration end #######
    return self.weight * scale_factor * norm_x + self.bias
def step(self, loss):
    ps = self.parameters
    gs = jt.grad(loss, ps)
    self.adam_step += 1
    n, (b0, b1) = float(self.adam_step), self.betas
    for p, g, v, m in zip(ps, gs, self.values, self.m):
        m.assign(b0 * m + (1 - b0) * g)
        v.assign(b1 * v + (1 - b1) * g * g)
        step_size = self.lr * jt.sqrt(1 - b1 ** n) / (1 - b0 ** n)
        p -= m * step_size / (jt.sqrt(v) + self.eps)
        p.detach_inplace()
    jt.sync(self.no_grad_parameters)
def step(self, loss): self.pre_step(loss) n = float(self.n_step) for pg in self.param_groups: # get arguments from each param_groups lr = pg.get("lr", self.lr) eps = pg.get("eps", self.eps) b0, b1 = pg.get("betas", self.betas) for p, g, v, m in zip(pg["params"], pg["grads"], pg["values"], pg["m"]): m.assign(b0 * m + (1-b0) * g) v.assign(b1 * v + (1-b1) * g * g) step_size = lr * jt.sqrt(1-b1**n) / (1-b0 ** n) p -= m * step_size / (jt.sqrt(v) + eps) p.detach_inplace()
def adam(model, loss, lr=3e-4, betas=[0.9, 0.999], eps=1e-8):
    ps = jt.find_vars(model)
    gs = jt.grad(loss, ps)
    with jt.var_scope('_'.join([model, 'adam']), unique=True):
        adam_step = jt.make_var([1], init=jt.zeros)
        adam_step += 1
        for p, g in zip(ps, gs):
            m = jt.make_var(p.shape, init=jt.zeros)
            v = jt.make_var(p.shape, init=jt.zeros)

            m.assign(betas[0] * m + (1 - betas[0]) * g)
            v.assign(betas[1] * v + (1 - betas[1]) * g * g)
            step_size = lr * jt.sqrt(1 - betas[1] ** adam_step) / (1 - betas[0] ** adam_step)
            p -= m * step_size / (jt.sqrt(v) + eps)
def step(self, loss): self.pre_step(loss) n = float(self.n_step) for pg in self.param_groups: # get arguments from each param_groups lr = pg.get("lr", self.lr) eps = pg.get("eps", self.eps) b0, b1 = pg.get("betas", self.betas) for p, g, v, m in zip(pg["params"], pg["grads"], pg["values"], pg["m"]): if p.is_stop_grad(): continue m.update(b0 * m + (1 - b0) * g) v.update(b1 * v + (1 - b1) * g * g) step_size = lr * jt.sqrt(1 - b1**n) / (1 - b0**n) p.update(p - m * step_size / (jt.sqrt(v) + eps))
def execute(self, x):
    if self.is_train:
        xmean = jt.mean(x, dims=[0, 2, 3], keepdims=1)
        x2mean = jt.mean(x * x, dims=[0, 2, 3], keepdims=1)

        xvar = x2mean - xmean * xmean
        norm_x = (x - xmean) / jt.sqrt(xvar + self.eps)
        self.running_mean += (xmean.sum([0, 2, 3]) - self.running_mean) * self.momentum
        self.running_var += (xvar.sum([0, 2, 3]) - self.running_var) * self.momentum
    else:
        running_mean = self.running_mean.broadcast(x, [0, 2, 3])
        running_var = self.running_var.broadcast(x, [0, 2, 3])
        norm_x = (x - running_mean) / jt.sqrt(running_var + self.eps)
    w = self.weight.broadcast(x, [0, 2, 3])
    b = self.bias.broadcast(x, [0, 2, 3])
    return norm_x * w + b
def change(gt, priors):
    """
    Compute the d_change metric proposed in Box2Pix:
    https://lmb.informatik.uni-freiburg.de/Publications/2018/UB18/paper-box2pix.pdf

    Input should be in point form (xmin, ymin, xmax, ymax).
    Output is of shape [num_gt, num_priors]
    Note this returns -change so it can be a drop in replacement for
    """
    num_priors = priors.shape[0]
    num_gt = gt.shape[0]

    gt_w = (gt[:, 2] - gt[:, 0]).unsqueeze(1).expand(num_gt, num_priors)
    gt_h = (gt[:, 3] - gt[:, 1]).unsqueeze(1).expand(num_gt, num_priors)

    gt_mat = gt.unsqueeze(1).expand(num_gt, num_priors, 4)
    pr_mat = priors.unsqueeze(0).expand(num_gt, num_priors, 4)

    diff = gt_mat - pr_mat
    diff[:, :, 0] /= gt_w
    diff[:, :, 2] /= gt_w
    diff[:, :, 1] /= gt_h
    diff[:, :, 3] /= gt_h

    return -jt.sqrt((diff.sqr()).sum(dim=2))
def check(self, h, w, cs, rs, pa, rtp, dim):
    a = jt.random([h, w])
    a.data
    with jt.log_capture_scope(
            log_v=0,
            log_vprefix="tuner_manager=100",
            # this value is used for force compile
            compile_options={"test_reduce_tuner": 1}) as logs:
        amean = jt.mean(a, dims=[dim], keepdims=1)
        a2mean = jt.mean(a * a, dims=[dim], keepdims=1)
        norm_aa = (a - amean.broadcast_var(a)) / (
            jt.sqrt(a2mean - amean * amean).broadcast_var(a))
        norm_aa.data
    logs = find_log_with_re(
        logs, "Run tuner reduce: confidence\\((20)\\) candidates\\((.*)\\)$")
    assert len(logs) == 1, logs
    assert logs[0][0] == "20", "confidence of reorder should be 20"
    candidates = simple_parser(logs[0][1])
    assert candidates == {
        "order0": [0, ],
        "order1": [1, ],
        "order2": [0, ],
        "split1": [2048, ],
    }
def pullaway_loss(embeddings):
    norm = jt.sqrt((embeddings ** 2).sum(1, keepdims=True))
    normalized_emb = embeddings / norm
    similarity = jt.matmul(normalized_emb, normalized_emb.transpose(1, 0))
    batch_size = embeddings.size(0)
    loss_pt = (jt.sum(similarity) - batch_size) / (batch_size * (batch_size - 1))
    return loss_pt
def compute_gradient_penalty(D, real_samples, fake_samples):
    'Calculates the gradient penalty loss for WGAN GP'
    alpha = jt.array(np.random.random((real_samples.shape[0], 1, 1, 1)).astype('float32'))
    interpolates = (alpha * real_samples) + ((1 - alpha) * fake_samples)
    d_interpolates = D(interpolates)
    gradients = jt.grad(d_interpolates, interpolates)
    gradients = gradients.reshape((gradients.shape[0], -1))
    gp = ((jt.sqrt((gradients.sqr()).sum(1)) - 1).sqr()).mean()
    return gp
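# Hedged usage sketch (not from the original source): how the penalty above enters the
# WGAN-GP critic loss. The toy critic, tensor shapes and lambda_gp value are illustrative
# assumptions standing in for a real discriminator and training loop.
import numpy as np  # also required by compute_gradient_penalty above
import jittor as jt

def _toy_critic(imgs):
    # stand-in discriminator: one scalar score per image
    return imgs.reshape((imgs.shape[0], -1)).sum(1, keepdims=True)

real_imgs = jt.random([4, 3, 32, 32])
fake_imgs = jt.random([4, 3, 32, 32])
lambda_gp = 10.0
gp = compute_gradient_penalty(_toy_critic, real_imgs, fake_imgs)
# the critic maximizes D(real) - D(fake); the penalty pushes the gradient norm towards 1
d_loss = -jt.mean(_toy_critic(real_imgs)) + jt.mean(_toy_critic(fake_imgs)) + lambda_gp * gp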
def step(self, loss):
    self.adam_step += 1
    ps = self.parameters
    gs = jt.grad(loss, ps)
    if jt.mpi:
        for g in gs:
            g.assign(g.mpi_all_reduce("mean"))
        if self.adam_step % self.param_sync_iter == 0:
            for p in ps:
                p.assign(p.mpi_all_reduce("mean"))
    n, (b0, b1) = float(self.adam_step), self.betas
    for p, g, v, m in zip(ps, gs, self.values, self.m):
        m.assign(b0 * m + (1 - b0) * g)
        v.assign(b1 * v + (1 - b1) * g * g)
        step_size = self.lr * jt.sqrt(1 - b1**n) / (1 - b0**n)
        p -= m * step_size / (jt.sqrt(v) + self.eps)
        p.detach_inplace()
    jt.sync(self.no_grad_parameters)
def execute(self, x):
    dims = [-i for i in range(len(self.normalized_shape), 0, -1)]
    mean = jt.mean(x, dims=dims, keepdims=1)
    numerator = x - mean
    variance = jt.mean(numerator.sqr(), dims=dims, keepdims=1)
    denominator = jt.sqrt(variance + self.eps)
    norm_x = numerator / denominator
    if self.elementwise_affine:
        norm_x = norm_x * self.weight + self.bias
    return norm_x
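# Note: with normalized_shape == (d,) the dims list above is [-1], and with
# normalized_shape == (h, w) it is [-2, -1], i.e. LayerNorm statistics are taken over
# the trailing feature dimensions only, independently for every sample.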
def batch_norm(x):
    xmean = jt.mean(x, dims=[0, 2, 3], keepdims=1)
    x2mean = jt.mean(x * x, dims=[0, 2, 3], keepdims=1)
    norm_x = (x - xmean.broadcast_var(x)) / (
        jt.sqrt(x2mean - xmean * xmean + jt.float32(1e-5)).broadcast_var(x))
    w = jt.make_var([x.shape[1]], init=get_init_var)
    b = jt.make_var([x.shape[1]], init=get_init_var)
    w = w.broadcast([1, w.shape[0], 1, 1], [0, 2, 3])
    b = b.broadcast([1, b.shape[0], 1, 1], [0, 2, 3])
    return norm_x * w + b
def step(self, loss):
    self.pre_step(loss)
    for pg in self.param_groups:
        # get arguments from each param_groups
        lr = pg.get("lr", self.lr)
        eps = pg.get("eps", self.eps)
        alpha = pg.get("alpha", self.alpha)
        for p, g, v in zip(pg["params"], pg["grads"], pg["values"]):
            if p.is_stop_grad():
                continue
            v.update(alpha * v + (1 - alpha) * g * g)
            p.update(p - lr * g / (jt.sqrt(v) + eps))
def calc_gradient_penalty(netD, real_data, generated_data):
    LAMBDA = 10
    b_size = real_data.shape[0]
    alpha = jt.random([b_size, 1, 1, 1])
    alpha = alpha.broadcast(real_data)
    interpolated = (alpha * real_data.data) + ((1 - alpha) * generated_data.data)
    prob_interpolated = netD(interpolated)
    gradients = jt.grad(prob_interpolated, interpolated)
    gradients = jt.reshape(gradients, [b_size, -1])
    gradients_norm = jt.sqrt(jt.sum(gradients**2, dim=1) + 1e-12)
    return LAMBDA * ((gradients_norm - 1)**2).mean()
def execute(self, x):
    xmean = jt.mean(x, dims=[2, 3], keepdims=1)
    x2mean = jt.mean(x * x, dims=[2, 3], keepdims=1)
    if self.sync and jt.in_mpi:
        xmean = xmean.mpi_all_reduce("mean")
        x2mean = x2mean.mpi_all_reduce("mean")

    xvar = jt.maximum(x2mean - xmean * xmean, 0)
    norm_x = (x - xmean) / jt.sqrt(xvar + self.eps)
    w = self.weight.broadcast(x, [0, 2, 3])
    b = self.bias.broadcast(x, [0, 2, 3])
    return norm_x * w + b
def execute(self, x):
    N, C, H, W = x.shape
    assert C == self.num_channels
    assert C % self.num_groups == 0
    x = x.reshape((N, self.num_groups, int(C / self.num_groups), H * W))
    xmean = jt.mean(x, dims=[2, 3], keepdims=1)
    x2mean = jt.mean(x * x, dims=[2, 3], keepdims=1)
    xvar = jt.maximum(x2mean - xmean * xmean, 0)
    norm_x = (x - xmean) / jt.sqrt(xvar + self.eps)
    w = self.weight.reshape((1, self.num_groups, C // self.num_groups, 1))
    b = self.bias.reshape((1, self.num_groups, C // self.num_groups, 1))
    return (norm_x * w + b).reshape((N, C, H, W))
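# Note: the reshape groups channels as (N, num_groups, C // num_groups, H * W), so the
# mean and variance above are computed per sample and per group, over the grouped
# channels and all spatial positions together.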
def execute(self, x):
    if self.is_train:
        xmean = jt.mean(x, dims=[0, 2, 3], keepdims=1)
        x2mean = jt.mean(x * x, dims=[0, 2, 3], keepdims=1)
        if self.sync and jt.in_mpi:
            xmean = xmean.mpi_all_reduce("mean")
            x2mean = x2mean.mpi_all_reduce("mean")

        xvar = x2mean - xmean * xmean
        norm_x = (x - xmean) / jt.sqrt(xvar + self.eps)
        self.running_mean.update(self.running_mean +
                                 (xmean.reshape((-1,)) - self.running_mean) * self.momentum)
        self.running_var.update(self.running_var +
                                (xvar.reshape((-1,)) - self.running_var) * self.momentum)
    else:
        running_mean = self.running_mean.broadcast(x, [0, 2, 3])
        running_var = self.running_var.broadcast(x, [0, 2, 3])
        norm_x = (x - running_mean) / jt.sqrt(running_var + self.eps)
    w = self.weight.broadcast(x, [0, 2, 3])
    b = self.bias.broadcast(x, [0, 2, 3])
    return norm_x * w + b
def __call__(self, boxlists):
    """
    Arguments:
        boxlists (list[BoxList])
    """
    # Compute level ids
    s = jt.sqrt(cat([boxlist.area() for boxlist in boxlists]))

    # Eqn.(1) in FPN paper
    target_lvls = jt.floor(self.lvl0 + jt.log2(s / self.s0 + self.eps))
    target_lvls = jt.clamp(target_lvls, min_v=self.k_min, max_v=self.k_max)
    return target_lvls.int32() - self.k_min
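# For reference, Eqn.(1) of the FPN paper assigns an RoI of area w*h to level
# k = floor(k0 + log2(sqrt(w*h) / 224)); here lvl0 plays the role of k0, s0 the canonical
# scale (224 in the paper), eps guards log2(0), and the result is clamped to
# [k_min, k_max] before being shifted to a zero-based index.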
def batch_norm(x, is_train, eps=1e-5, momentum=0.1):
    w = jt.make_var([x.shape[1]], init=lambda *a: init.constant(*a, 1.0))
    b = jt.make_var([x.shape[1]], init=lambda *a: init.constant(*a, 0.0))
    running_mean = jt.make_var([x.shape[1]], init=lambda *a: init.constant(*a, 0.0))
    running_var = jt.make_var([x.shape[1]], init=lambda *a: init.constant(*a, 1.0))

    w = w.broadcast(x, [0, 2, 3])
    b = b.broadcast(x, [0, 2, 3])
    if is_train:
        xmean = jt.mean(x, dims=[0, 2, 3], keepdims=1)
        x2mean = jt.mean(x * x, dims=[0, 2, 3], keepdims=1)
        xvar = x2mean - xmean * xmean
        norm_x = (x - xmean) / jt.sqrt(xvar + eps)

        running_mean += (xmean.sum([0, 2, 3]) - running_mean) * momentum
        running_var += (xvar.sum([0, 2, 3]) - running_var) * momentum
    else:
        running_mean = running_mean.broadcast(x, [0, 2, 3])
        running_var = running_var.broadcast(x, [0, 2, 3])
        norm_x = (x - running_mean) / jt.sqrt(running_var + eps)
    return norm_x * w + b
def step(self, loss=None):
    if loss is not None:
        self.pre_step(loss)
    n = float(self.n_step)
    for pg in self.param_groups:
        # get arguments from each param_groups
        lr = pg.get("lr", self.lr)
        eps = pg.get("eps", self.eps)
        weight_decay = pg.get("weight_decay", self.weight_decay)
        b0, b1 = pg.get("betas", self.betas)
        for p, g, v, m in zip(pg["params"], pg["grads"], pg["values"], pg["m"]):
            if p.is_stop_grad():
                continue
            p.update(p * (1 - lr * weight_decay))
            bias_correction1 = 1 - b0**n
            bias_correction2 = 1 - b1**n
            m.update(b0 * m + (1 - b0) * g)  # exp_avg
            v.update(b1 * v + (1 - b1) * g * g)  # exp_avg_sq
            denom = jt.sqrt(v) / jt.sqrt(bias_correction2) + eps
            step_size = lr / bias_correction1
            p.update(p - step_size * m / denom)
    self.zero_grad()
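# Note: the p.update(p * (1 - lr * weight_decay)) line is AdamW-style decoupled weight
# decay, applied directly to the parameter instead of being folded into the gradient;
# the remaining lines are standard bias-corrected Adam.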
def projection(vertices, K, R, t, dist_coeffs, orig_size, eps=1e-9):
    '''
    Calculate projective transformation of vertices given a projection matrix
    Input parameters:
    K: batch_size * 3 * 3 intrinsic camera matrix
    R, t: batch_size * 3 * 3, batch_size * 1 * 3 extrinsic calibration parameters
    dist_coeffs: vector of distortion coefficients
    orig_size: original size of image captured by the camera
    Returns: For each point [X,Y,Z] in world coordinates [u,v,z] where u,v are the
    coordinates of the projection in pixels and z is the depth
    '''

    # instead of P*x we compute x'*P'
    vertices = jt.matmul(vertices, R.transpose((0, 2, 1))[0]) + t
    x, y, z = vertices[:, :, 0], vertices[:, :, 1], vertices[:, :, 2]
    x_ = x / (z + eps)
    y_ = y / (z + eps)

    # Get distortion coefficients from vector
    k1 = dist_coeffs[:, 0].unsqueeze(1)
    k2 = dist_coeffs[:, 1].unsqueeze(1)
    p1 = dist_coeffs[:, 2].unsqueeze(1)
    p2 = dist_coeffs[:, 3].unsqueeze(1)
    k3 = dist_coeffs[:, 4].unsqueeze(1)

    # we use x_ for x' and x__ for x'' etc.
    x_2 = x_.sqr()
    y_2 = y_.sqr()
    r = jt.sqrt(x_2 + y_2)
    r2 = r.sqr()
    r4 = r2.sqr()
    r6 = r4 * r2

    tmp = k1 * (r2) + k2 * (r4) + k3 * (r6) + 1
    x__ = x_ * tmp + 2 * p1 * x_ * y_ + p2 * (r2 + 2 * x_2)
    y__ = y_ * tmp + p1 * (r2 + 2 * y_2) + 2 * p2 * x_ * y_
    vertices = jt.stack([x__, y__, jt.ones(z.shape)], dim=-1)
    vertices = jt.matmul(vertices, K.transpose((0, 2, 1))[0])
    u, v = vertices[:, :, 0], vertices[:, :, 1]
    v = orig_size - v
    # map u,v from [0, img_size] to [-1, 1] to use by the renderer
    u = 2 * (u - orig_size / 2.) / orig_size
    v = 2 * (v - orig_size / 2.) / orig_size
    vertices = jt.stack([u, v, z], dim=-1)
    return vertices
def execute(self, x):
    N = x.shape[0]
    C = self.num_channels
    output_shape = (N, -1)
    # TODO: 3d group norm
    if x.ndim == 4:
        output_shape = x.shape
    assert C % self.num_groups == 0
    x = x.reshape((N, self.num_groups, int(C / self.num_groups), -1))
    xmean = jt.mean(x, dims=[2, 3], keepdims=1)
    x2mean = jt.mean(x * x, dims=[2, 3], keepdims=1)
    xvar = jt.maximum(x2mean - xmean * xmean, 0)
    norm_x = (x - xmean) / jt.sqrt(xvar + self.eps)
    if not self.affine:
        return norm_x.reshape(output_shape)
    w = self.weight.reshape((1, self.num_groups, C // self.num_groups, 1))
    b = self.bias.reshape((1, self.num_groups, C // self.num_groups, 1))
    return (norm_x * w + b).reshape(output_shape)
def check(self, h, w, cs, rs, pa, rtp, dim):
    a = jt.random([h, w])
    a.sync()
    with jt.log_capture_scope(
            log_v=0,
            log_vprefix="tuner_manager=100",
            # this value is used for force compile
            compile_options={"test_new_fused_op": 1}) as logs:
        amean = jt.mean(a, dims=[dim], keepdims=1)
        a2mean = jt.mean(a * a, dims=[dim], keepdims=1)
        norm_aa = (a - amean.broadcast_var(a)) / (
            jt.sqrt(a2mean - amean * amean).broadcast_var(a))
        norm_aa.sync()
    logs = find_log_with_re(
        logs, "Run tuner reduce: confidence\\((.*)\\) candidates\\((.*)\\)$")
    assert len(logs) == 3, logs
def execute(self, input, step=0, alpha=-1):
    for i in range(step, -1, -1):
        index = self.n_layer - i - 1

        if i == step:
            out = self.from_rgb[index](input)

        if i == 0:
            out_std = jt.sqrt(out.var + 1e-8)
            mean_std = out_std.mean()
            mean_std = mean_std.expand(out.size(0), 1, 4, 4)
            out = jt.cat([out, mean_std], 1)

        out = self.progression[index](out)

        if i > 0:
            if i == step and 0 <= alpha < 1:
                skip_rgb = nn.pool(input, 2)
                skip_rgb = self.from_rgb[index + 1](skip_rgb)
                out = (1 - alpha) * skip_rgb + alpha * out

    out = out.squeeze(2).squeeze(2)
    out = self.linear(out)
    return out
def hypot(a, b):
    return jt.sqrt(a.sqr() + b.sqr())
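# e.g. hypot(jt.array([3.0]), jt.array([4.0])) evaluates to [5.0]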
def compute_centerness_targets(self, reg_targets):
    left_right = reg_targets[:, [0, 2]]
    top_bottom = reg_targets[:, [1, 3]]
    centerness = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * \
                 (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0])
    return jt.sqrt(centerness)
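# This is the FCOS centerness target sqrt(min(l, r) / max(l, r) * min(t, b) / max(t, b)),
# with the (l, t, r, b) regression targets stored in columns [0, 1, 2, 3] of reg_targets.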
def norm(x, k, dim):
    assert k == 2 or k == 1
    if k == 1:
        return x.abs().sum(dim)
    if k == 2:
        return jt.sqrt((x.sqr()).sum(dim).maximum(1e-6))