def pairwise_distance(x1, x2, p=2, eps=1e-6):
    r"""Return the row-wise p-norm distance between two batches of vectors.

    For every row ``i`` the result is

    .. math ::
        \left( \sum_{j=1}^{D} \left( \vert x1_{ij} - x2_{ij} \vert + eps \right) ^ p \right) ^ {1/p}

    Args:
        x1: first input tensor
        x2: second input tensor, same size as ``x1``
        p: the norm degree. Default: 2
        eps (float, optional): Small value added to every absolute
            difference before it is raised to ``p``. Default: 1e-6

    Shape:
        - Input: :math:`(N, D)` where `D = vector dimension`
        - Output: :math:`(N, 1)`

    Raises:
        AssertionError: if the two inputs differ in size or are not 2D.

    Example::

        >>> input1 = torch.randn(100, 128)
        >>> input2 = torch.randn(100, 128)
        >>> output = pairwise_distance(input1, input2, p=2)
    """
    assert x1.size() == x2.size(), "Input sizes must be equal."
    assert x1.dim() == 2, "Input must be a 2D matrix."
    shifted = torch.abs(x1 - x2) + eps
    powered_sum = torch.pow(shifted, p).sum(dim=1, keepdim=True)
    return torch.pow(powered_sum, 1. / p)
def forward(self, model_output, target, mask, attr):
    """Masked sequence loss plus a weighted attribute-regression term.

    Args:
        model_output: pair ``(pred_seq, pred_attr)``.
            pred_seq  (from model.forward())     (batch_size, max_seq_len, vocab_size)
            pred_attr                            (batch_size, n_attr)
        target: (from dataloader->labels) (batch_size, max_seq_len)
        mask: (from dataloader->masks) (batch_size, max_seq_len)
        attr: attribute targets, cast to float before use.

    Returns:
        ``sum(-pred_seq[target] * mask) / sum(mask)`` plus
        ``self.attr_weight`` times the L2 norm of ``pred_attr - attr``
        divided by the batch size.
    """
    pred_seq, pred_attr = model_output

    # Dump the shapes once, on the first call only.
    if not self.seen:
        print('> in LanguageModelCriterion.forward(input, target, mask):')
        print(' pred_seq', pred_seq.shape)  # (200, 17, 3562)
        print(' pred_attr', pred_attr.shape)  # (200, 1000)
        print(' target', target.shape)  # (200, 17)
        print(' mask', mask.shape)  # (200, 17)
        print(' attr', attr.shape)  # (200, 1000)
        self.seen = True

    # Truncate labels and mask to the predicted sequence length.
    seq_len = pred_seq.size(1)
    target = target[:, :seq_len]
    mask = mask[:, :seq_len]

    # Flatten so that every row is one time step.
    flat_pred = to_contiguous(pred_seq).view(-1, pred_seq.size(2))
    flat_target = to_contiguous(target).view(-1, 1)
    flat_mask = to_contiguous(mask).view(-1, 1)

    picked = flat_pred.gather(1, flat_target)
    seq_loss = torch.sum(-picked * flat_mask) / torch.sum(flat_mask)

    bsize = pred_attr.size(0)
    attr_diff = to_contiguous(pred_attr) - to_contiguous(attr.float())
    attr_loss = torch.pow(torch.sum(torch.pow(attr_diff, 2)), 0.5) / bsize

    return seq_loss + self.attr_weight * attr_loss
def model():
    """Sample ``mu_latent`` from a normal prior and observe two log-normal data points.

    The prior scale is ``tau0 ** -0.5`` and the observation scale is
    ``tau ** -0.5``. Returns the sampled ``mu_latent``.
    """
    prior_sigma = torch.pow(self.tau0, -0.5)
    mu_latent = pyro.sample("mu_latent", dist.normal, self.mu0, prior_sigma)
    sigma = torch.pow(self.tau, -0.5)
    for idx, site in enumerate(("obs0", "obs1")):
        pyro.observe(site, dist.lognormal, self.data[idx], mu_latent, sigma)
    return mu_latent
def forward(self, x, labels):
    """Mean squared distance between each feature and the center of its class.

    Args:
        - x: feature matrix with shape (batch_size, feat_dim).
        - labels: ground truth class indices with shape (batch_size).

    Returns:
        Scalar tensor: the mean over the batch of ``||x_i - centers[labels_i]||^2``,
        each term clamped to ``[1e-12, 1e+12]`` for numerical stability.
    """
    batch_size = x.size(0)

    # ||x||^2 + ||c||^2 - 2 * x . c, evaluated for every (sample, class) pair.
    x_sq = torch.pow(x, 2).sum(dim=1, keepdim=True).expand(batch_size, self.num_classes)
    c_sq = torch.pow(self.centers, 2).sum(dim=1, keepdim=True).expand(
        self.num_classes, batch_size).t()
    # Keyword form of addmm; the positional (beta, alpha, m1, m2) overload is deprecated.
    distmat = torch.addmm(x_sq + c_sq, x, self.centers.t(), beta=1, alpha=-2)

    # Build the class indices on the labels' own device so the comparison
    # below works wherever the labels live.
    classes = torch.arange(self.num_classes, device=labels.device).long()
    mask = labels.unsqueeze(1).expand(batch_size, self.num_classes).eq(
        classes.expand(batch_size, self.num_classes))

    # Boolean indexing walks the matrix row by row, so this selects the same
    # entries, in the same order, as looping over the samples.
    own_dist = distmat[mask].clamp(min=1e-12, max=1e+12)  # for numerical stability
    return own_dist.mean()
def model():
    """Sample ``mu_latent`` and observe two data points under a transformed normal.

    The observation distribution is ``dist.normal`` pushed through an
    ``AffineExp`` bijector built from ``tau ** -0.5`` and ``mu_latent``.
    Returns the sampled ``mu_latent``.
    """
    prior_sigma = torch.pow(self.tau0, -0.5)
    mu_latent = pyro.sample("mu_latent", dist.normal, self.mu0, prior_sigma)
    bijector = AffineExp(torch.pow(self.tau, -0.5), mu_latent)
    x_dist = TransformedDistribution(dist.normal, bijector)
    for idx, site in enumerate(("obs0", "obs1")):
        pyro.observe(site, x_dist, self.data[idx], ng_zeros(1), ng_ones(1))
    return mu_latent
def model():
    """Sample ``mu_latent`` from a normal prior and observe every datum in ``self.data``.

    The prior uses scale ``lam0 ** -0.5`` and the enclosing scope's
    ``reparameterized`` flag; each observation uses scale ``lam ** -0.5``.
    Returns the sampled ``mu_latent``.
    """
    prior = dist.Normal(self.mu0, torch.pow(self.lam0, -0.5),
                        reparameterized=reparameterized)
    mu_latent = pyro.sample("mu_latent", prior)
    for i, x in enumerate(self.data):
        site = "obs_%d" % i
        obs_sigma = torch.pow(self.lam, -0.5)
        pyro.observe(site, dist.normal, x, mu_latent, obs_sigma)
    return mu_latent
def updateOutput(self, input):
    """Forward pass of cross-channel local response normalisation.

    Computes ``output = input * scale ** -beta`` where, for every channel ``c``,
    ``scale[c] = k + alpha / size * sum(input[j] ** 2)`` over the channels ``j``
    in a window centred on ``c``.

    Args:
        input: 4-D tensor; dimension 1 is the channel dimension that the
            window slides over.

    Returns:
        ``self.output``, resized to match ``input``.

    Raises:
        AssertionError: if ``input`` is not 4-D.
    """
    assert input.dim() == 4

    if self.scale is None:
        self.scale = input.new()

    if input.type() == 'torch.cuda.FloatTensor':
        # CUDA float tensors are handled entirely by the backend kernel.
        self._backend.SpatialCrossMapLRN_updateOutput(
            self._backend.library_state,
            input,
            self.output,
            self.scale,
            self.size,
            self.alpha,
            self.beta,
            self.k
        )
    else:
        channels = input.size(1)

        self.output.resize_as_(input)
        self.scale.resize_as_(input)

        # use output storage as temporary buffer
        inputSquare = self.output
        torch.pow(input, 2, out=inputSquare)

        # Number of channels, counted from the window centre upwards, that
        # belong to a window (the centre itself included).
        prePad = int((self.size - 1) / 2 + 1)
        prePadCrop = channels if prePad > channels else prePad

        scaleFirst = self.scale.select(1, 0)
        scaleFirst.zero_()
        # compute first feature map normalization
        for c in range(prePadCrop):
            scaleFirst.add_(inputSquare.select(1, c))

        # reuse computations for next feature maps normalization
        # by adding the next feature map and removing the previous
        for c in range(1, channels):
            scalePrevious = self.scale.select(1, c - 1)
            scaleCurrent = self.scale.select(1, c)
            scaleCurrent.copy_(scalePrevious)
            if c < channels - prePad + 1:
                squareNext = inputSquare.select(1, c + prePad - 1)
                scaleCurrent.add_(squareNext)
            # The channel leaving the window is c - prePad, which exists as
            # soon as c == prePad. The previous test `c > prePad` skipped
            # that case, so channel 0 was never subtracted and stayed in
            # every later window.
            if c >= prePad:
                squarePrevious = inputSquare.select(1, c - prePad)
                scaleCurrent.sub_(squarePrevious)

        self.scale.mul_(self.alpha / self.size).add_(self.k)

        # inputSquare (aliasing self.output) is no longer needed; overwrite it.
        torch.pow(self.scale, -self.beta, out=self.output)
        self.output.mul_(input)

    return self.output
def model():
    """Sample ``mu_latent`` and observe ``self.data`` through ``pyro.map_data``.

    Every datum is observed under a normal with mean ``mu_latent`` and scale
    ``lam ** -0.5``; ``self.batch_size`` is forwarded to ``map_data``.
    Returns the sampled ``mu_latent``.
    """
    prior_sigma = torch.pow(self.lam0, -0.5)
    mu_latent = pyro.sample("mu_latent", dist.normal, self.mu0, prior_sigma)

    def observe_one(i, x):
        return pyro.observe("obs_%d" % i, dist.normal, x, mu_latent,
                            torch.pow(self.lam, -0.5))

    pyro.map_data("aaa", self.data, observe_one, batch_size=self.batch_size)
    return mu_latent
def model(self, reparameterized, difficulty=0.0):
    """Chain of ``self.N`` normal latents followed by a vectorised observation.

    Latent ``k`` is centred on latent ``k - 1`` (the first on ``self.loc0``)
    with scale ``lambdas[k - 1] ** -0.5``. The data are observed around the
    last latent with scale ``lambdas[N] ** -0.5``. ``reparameterized`` and
    ``difficulty`` are accepted but not read in this body.

    Returns:
        The last latent in the chain.
    """
    loc_N = self.loc0
    for k in range(1, self.N + 1):
        scale_k = torch.pow(self.lambdas[k - 1], -0.5)
        loc_N = pyro.sample("loc_latent_%d" % k, dist.Normal(loc_N, scale_k))

    with pyro.iarange("data", self.data.size(0)):
        obs_loc = loc_N.expand_as(self.data)
        obs_scale = torch.pow(self.lambdas[self.N], -0.5).expand_as(self.data)
        pyro.sample("obs", dist.Normal(obs_loc, obs_scale), obs=self.data)
    return loc_N
def model(*args, **kwargs):
    """Chain of ``self.N`` normal latents followed by one observation per datum.

    Latent ``k`` is centred on latent ``k - 1`` (the first on ``self.mu0``)
    with scale ``lambdas[k - 1] ** -0.5``. Every datum is observed around the
    last latent with scale ``lambdas[N] ** -0.5``. Positional and keyword
    arguments are accepted and ignored.

    Returns:
        The last latent in the chain.
    """
    mu_N = self.mu0
    for k in range(1, self.N + 1):
        scale_k = torch.pow(self.lambdas[k - 1], -0.5)
        mu_N = pyro.sample("mu_latent_%d" % k, dist.Normal(mu_N, scale_k))

    for i, x in enumerate(self.data):
        site = "obs_%d" % i
        pyro.observe(site, dist.normal, x, mu_N,
                     torch.pow(self.lambdas[self.N], -0.5))
    return mu_N
def log_norm(x, mu, std):
    """Element-wise log density of ``x`` under Normal(``mu``, ``std``).

    All three inputs are flattened to 1-D first, so the result is 1-D too.
    """
    flat_x = x.view(-1)
    flat_mu = mu.view(-1)
    flat_std = std.view(-1)

    log_normaliser = -0.5 * torch.log(2 * np.pi * torch.pow(flat_std, 2))
    inv_var = 1 / torch.pow(flat_std, 2)
    sq_err = torch.pow((flat_x - flat_mu), 2)
    return log_normaliser - 0.5 * inv_var * sq_err
def mean_dist(source_points, warped_points, L_pck):
    """Per-sample mean keypoint distance, normalised by ``L_pck``.

    For every sample only the leading ``N_pts`` columns are used, where
    ``N_pts`` is the number of source columns whose first two rows both differ
    from -1. The distance between a source and a warped column is the
    Euclidean norm taken over dimension 0.

    Returns:
        Tensor of length ``batch_size`` holding ``mean(distance / L_pck[i])``.
    """
    batch_size = source_points.size(0)
    dist = torch.zeros((batch_size))
    for idx in range(batch_size):
        src = source_points[idx, :]
        wrp = warped_points[idx, :]
        valid = torch.ne(src[0, :], -1) * torch.ne(src[1, :], -1)
        n_pts = torch.sum(valid)
        offsets = src[:, :n_pts] - wrp[:, :n_pts]
        point_distance = offsets.pow(2).sum(0).pow(0.5)
        ref_len = L_pck[idx].expand_as(point_distance)
        dist[idx] = (point_distance / ref_len).mean()
    return dist
def pck(source_points, warped_points, L_pck, alpha=0.1):
    """Per-sample fraction of keypoints warped to within ``alpha * L_pck``.

    For every sample only the leading ``N_pts`` columns are used, where
    ``N_pts`` is the number of source columns whose first two rows both differ
    from -1. A keypoint counts as correct when its Euclidean distance (over
    dimension 0) is at most ``L_pck[i] * alpha``.

    Returns:
        Tensor of length ``batch_size`` with the fraction of correct keypoints.
    """
    batch_size = source_points.size(0)
    scores = torch.zeros((batch_size))
    for idx in range(batch_size):
        src = source_points[idx, :]
        wrp = warped_points[idx, :]
        valid = torch.ne(src[0, :], -1) * torch.ne(src[1, :], -1)
        n_pts = torch.sum(valid)
        offsets = src[:, :n_pts] - wrp[:, :n_pts]
        point_distance = offsets.pow(2).sum(0).pow(0.5)
        threshold = L_pck[idx].expand_as(point_distance) * alpha
        hits = torch.le(point_distance, threshold)
        scores[idx] = hits.float().mean()
    return scores
def euclidean_dist(x, y):
    """Pairwise Euclidean distance between the rows of ``x`` and the rows of ``y``.

    Args:
        x: pytorch Variable, with shape [m, d]
        y: pytorch Variable, with shape [n, d]

    Returns:
        dist: pytorch Variable, with shape [m, n]; entry ``(i, j)`` is
        ``||x[i] - y[j]||``. Squared distances are clamped to at least 1e-12
        before the square root, so the smallest returned value is 1e-6.
    """
    m, n = x.size(0), y.size(0)
    xx = torch.pow(x, 2).sum(1, keepdim=True).expand(m, n)
    yy = torch.pow(y, 2).sum(1, keepdim=True).expand(n, m).t()
    # ||x||^2 + ||y||^2 - 2 * x . y. Uses the keyword form of addmm; the
    # positional (beta, alpha, mat1, mat2) overload is deprecated.
    sq_dist = torch.addmm(xx + yy, x, y.t(), beta=1, alpha=-2)
    return sq_dist.clamp(min=1e-12).sqrt()  # for numerical stability
def test_save_and_load(self):
    """Round-trip the param store through save/load and check module syncing.

    Registers two modules and one free parameter, takes one Adam step, saves
    the store to 'paramstore.unittest.out', clears it, loads it back, and then
    checks that ``pyro.module(..., update_module_params=True)`` makes
    ``linear_module3`` share the stored parameters.
    """
    lin = pyro.module("mymodule", self.linear_module)
    pyro.module("mymodule2", self.linear_module2)
    x = torch.randn(1, 3)
    myparam = pyro.param("myparam", torch.tensor(1.234 * torch.ones(1), requires_grad=True))

    # One optimisation step so that the stored values differ from the initial ones.
    cost = torch.sum(torch.pow(lin(x), 2.0)) * torch.pow(myparam, 4.0)
    cost.backward()
    params = list(self.linear_module.parameters()) + [myparam]
    optim = torch.optim.Adam(params, lr=.01)
    myparam_copy_stale = copy(pyro.param("myparam").detach().cpu().numpy())

    optim.step()

    # Snapshot the post-step state to compare against after reloading.
    myparam_copy = copy(pyro.param("myparam").detach().cpu().numpy())
    param_store_params = copy(pyro.get_param_store()._params)
    param_store_param_to_name = copy(pyro.get_param_store()._param_to_name)
    assert len(list(param_store_params.keys())) == 5
    assert len(list(param_store_param_to_name.values())) == 5

    # Save, wipe, and reload the store.
    pyro.get_param_store().save('paramstore.unittest.out')
    pyro.clear_param_store()
    assert len(list(pyro.get_param_store()._params)) == 0
    assert len(list(pyro.get_param_store()._param_to_name)) == 0
    pyro.get_param_store().load('paramstore.unittest.out')

    def modules_are_equal():
        # True only when linear_module3 and linear_module hold exactly the
        # same weight and bias values.
        weights_equal = np.sum(np.fabs(
            self.linear_module3.weight.detach().cpu().numpy() -
            self.linear_module.weight.detach().cpu().numpy())) == 0.0
        bias_equal = np.sum(np.fabs(
            self.linear_module3.bias.detach().cpu().numpy() -
            self.linear_module.bias.detach().cpu().numpy())) == 0.0
        return (weights_equal and bias_equal)

    assert not modules_are_equal()

    # Without update_module_params the module keeps its own tensors.
    pyro.module("mymodule", self.linear_module3, update_module_params=False)
    assert id(self.linear_module3.weight) != id(pyro.param('mymodule$$$weight'))
    assert not modules_are_equal()

    # With update_module_params the module's tensors become the stored ones.
    pyro.module("mymodule", self.linear_module3, update_module_params=True)
    assert id(self.linear_module3.weight) == id(pyro.param('mymodule$$$weight'))
    assert modules_are_equal()

    # The reloaded store must match the snapshot taken before saving.
    myparam = pyro.param("myparam")
    store = pyro.get_param_store()
    assert myparam_copy_stale != myparam.detach().cpu().numpy()
    assert myparam_copy == myparam.detach().cpu().numpy()
    assert sorted(param_store_params.keys()) == sorted(store._params.keys())
    assert sorted(param_store_param_to_name.values()) == sorted(store._param_to_name.values())
    assert sorted(store._params.keys()) == sorted(store._param_to_name.values())
def backward(self, grad_output):
    """Gradient of the cross-channel local response normalisation w.r.t. its input.

    Uses the ``input`` and ``output`` saved by the forward pass together with
    ``self.scale``. When ``self._backend`` is set the work is delegated to its
    ``SpatialCrossMapLRN_updateGradInput`` kernel; otherwise the gradient is
    accumulated with tensor ops, one sample at a time.

    Args:
        grad_output: gradient of the loss w.r.t. the forward output.

    Returns:
        ``grad_input``, resized to match ``input`` on the tensor-op path.
    """
    input, output = self.saved_tensors
    grad_input = grad_output.new()

    if self._backend is not None:
        self._backend.SpatialCrossMapLRN_updateGradInput(
            self._backend.library_state,
            input,
            grad_output,
            grad_input,
            self.scale,
            output,
            self.size,
            self.alpha,
            self.beta,
            self.k
        )
    else:
        batch_size = input.size(0)
        channels = input.size(1)
        input_height = input.size(2)
        input_width = input.size(3)

        # Scratch buffers: a zero-padded stack of per-channel ratios and a
        # running sum over a window of that stack.
        paddded_ratio = input.new(channels + self.size - 1, input_height, input_width)
        accum_ratio = input.new(input_height, input_width)

        cache_ratio_value = 2 * self.alpha * self.beta / self.size
        inversePrePad = int(self.size - (self.size - 1) / 2)

        # Direct term: grad_output * scale ** -beta.
        grad_input.resize_as_(input)
        torch.pow(self.scale, -self.beta, out=grad_input).mul_(grad_output)

        paddded_ratio.zero_()
        # NOTE(review): this slice leaves `inversePrePad` zero rows before the
        # real channels and `size - 1 - inversePrePad` after them — confirm
        # that this asymmetric padding is intended.
        padded_ratio_center = paddded_ratio.narrow(0, inversePrePad, channels)
        for n in range(batch_size):
            # ratio = grad_output * output / scale, written into the centre slice.
            torch.mul(grad_output[n], output[n], out=padded_ratio_center)
            padded_ratio_center.div_(self.scale[n])
            # Seed the running sum with the first size - 1 padded rows.
            torch.sum(
                paddded_ratio.narrow(0, 0, self.size - 1), 0, keepdim=False, out=accum_ratio)
            for c in range(channels):
                # Slide the window: add the row entering it, apply the cross
                # term to this channel, then drop the row leaving it.
                accum_ratio.add_(paddded_ratio[c + self.size - 1])
                grad_input[n][c].addcmul_(-cache_ratio_value, input[n][c], accum_ratio)
                accum_ratio.add_(-1, paddded_ratio[c])

    return grad_input
def forward(self, input, label):
    """Cosine logits with an additive angular margin applied to the target class.

    Args:
        input: feature matrix; rows are L2-normalised before use.
        label: class index per row (reshaped to a column and cast to long).

    Returns:
        ``self.s * logits`` where the target-class logit is ``cos(theta + m)``
        (as given by ``self.cos_m`` / ``self.sin_m``) and every other logit is
        ``cos(theta)``. Where the margin is not applied (see
        ``self.easy_margin`` / ``self.th``), the target logit falls back to
        ``cos(theta)`` or ``cos(theta) - self.mm``.
    """
    # --------------------------- cos(theta) & phi(theta) ---------------------------
    if self.device_id is None:
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
    else:
        # Model-parallel path: split the class weights across the devices and
        # gather every partial result on the first one.
        first_device = self.device_id[0]
        sub_weights = torch.chunk(self.weight, len(self.device_id), dim=0)
        parts = []
        for device, sub_weight in zip(self.device_id, sub_weights):
            temp_x = input.cuda(device)
            weight = sub_weight.cuda(device)
            parts.append(
                F.linear(F.normalize(temp_x), F.normalize(weight)).cuda(first_device))
        cosine = torch.cat(parts, dim=1)

    # Rounding can push cos^2 marginally above 1; clamp so that sqrt never
    # sees a negative number and returns NaN.
    sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(min=0.0))
    phi = cosine * self.cos_m - sine * self.sin_m
    if self.easy_margin:
        phi = torch.where(cosine > 0, phi, cosine)
    else:
        phi = torch.where(cosine > self.th, phi, cosine - self.mm)

    # --------------------------- convert label to one-hot ---------------------------
    # Allocate next to `cosine` so this also works when the inputs are not on the CPU.
    one_hot = torch.zeros_like(cosine)
    one_hot.scatter_(1, label.view(-1, 1).long(), 1)

    # Take phi for the target class and the plain cosine everywhere else.
    output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
    output *= self.s
    return output
def log_prob_accept(self, value):
    """Log acceptance probability of ``value``, built from ``self._d`` and ``self._c``.

    With ``v = value / d``, ``y = v ** (1/3)`` and ``x = (y - 1) / c`` the
    result is ``0.5 * x**2 + d * (1 - v + log(v))``. Entries where ``y <= 0``
    are set to ``-inf``.
    """
    ratio = value / self._d
    cube_root = torch.pow(ratio, 1.0 / 3.0)
    x = (cube_root - 1.0) / self._c
    quadratic = 0.5 * x * x
    log_part = self._d * (1.0 - ratio + torch.log(ratio))
    result = quadratic + log_part
    result[cube_root <= 0] = -float('inf')
    return result
def print_gradients(self, X, Y):
    """Print the parameter gradients of a squared-error loss and part of its graph.

    Runs ``self.forward(X)``, back-propagates ``sum((pred - Y) ** 2)`` and
    prints the gradients of ``linear1.weight``, ``linear1.bias``, ``W2`` and
    ``b2``, followed by the first three nodes of the autograd graph. The
    gradients are cleared again before returning.
    """
    print ("--------- GRADIENTS ------------")
    predictions = self.forward(X)

    ## Define the loss:
    residual = predictions - Y
    loss = torch.sum(torch.pow(residual, 2))

    ## Clean previous gradients
    self.zero_grad()
    loss.backward()

    print (self.linear1.weight.grad)
    print (self.linear1.bias.grad)
    print (self.W2.grad)
    print (self.b2.grad)

    print ("----------- STRUCTURE ------------")
    # Walk down the first input edge of each autograd node.
    node = loss.grad_fn
    print(node)  # MSELoss
    node = node.next_functions[0][0]
    print(node)  # Linear 1
    node = node.next_functions[0][0]
    print(node)  # Sigmoid

    ## Clean previous gradients
    self.zero_grad()
def singleTagLoss(pred_tag, keypoints):
    """
    associative embedding loss for one image

    Args:
        pred_tag: tensor indexed by joint location; ``pred_tag[idx]`` is the
            tag vector at that location (assumed 1-D — TODO confirm with caller).
        keypoints: iterable of people, each an iterable of ``(index, visible)``
            pairs; only joints with ``visible > 0`` are used.

    Returns:
        ``(push, pull)``: ``push`` penalises different people having similar
        mean tags, ``pull`` penalises the spread of one person's tags around
        their mean. Both are zero tensors wrapped by ``make_input`` when no
        person has a visible joint.
    """
    eps = 1e-6
    tags = []
    pull = 0
    for person in keypoints:
        visible = [pred_tag[joint[0]] for joint in person if joint[1] > 0]
        if len(visible) == 0:
            continue
        visible = torch.stack(visible)
        # keepdim=True is spelled out: the `[:, 0]` indexing below relies on
        # the reduced dimension being kept, which is no longer the default.
        tags.append(torch.mean(visible, dim=0, keepdim=True))
        pull = pull + torch.mean((visible - tags[-1].expand_as(visible)) ** 2)

    if len(tags) == 0:
        return make_input(torch.zeros([1]).float()), make_input(torch.zeros([1]).float())

    tags = torch.stack(tags)[:, 0]

    # Squared distance between every pair of per-person mean tags.
    num = tags.size()[0]
    size = (num, num, tags.size()[1])
    A = tags.unsqueeze(dim=1).expand(*size)
    B = A.permute(1, 0, 2)
    diff = A - B
    # Same reason for the explicit keepdim as above (`[:, :, 0]` follows).
    diff = torch.pow(diff, 2).sum(dim=2, keepdim=True)[:, :, 0]

    push = torch.exp(-diff)
    # The diagonal contributes exp(0) = 1 per person; remove it.
    push = (torch.sum(push) - num)
    return push / ((num - 1) * num + eps) * 0.5, pull / (num + eps)
def test_regularization(self):
    """Check the regularization penalty and that it is added to the training loss only."""
    penalty = self.model.get_regularization_penalty().data
    assert (penalty > 0).all()

    # Config specifies penalty as
    #   "regularizer": [
    #     ["weight$", {"type": "l2", "alpha": 10}],
    #     ["bias$", {"type": "l1", "alpha": 5}]
    #   ]
    expected_penalty = 0
    for name, parameter in self.model.named_parameters():
        if name.endswith("weight"):
            expected_penalty += 10 * torch.sum(torch.pow(parameter, 2))
        elif name.endswith("bias"):
            expected_penalty += 5 * torch.sum(torch.abs(parameter))

    assert (penalty == expected_penalty.data).all()

    # You get a RuntimeError if you call `model.forward` twice on the same inputs.
    # The data and config are such that the whole dataset is one batch.
    training_batch = next(self.iterator(self.instances, num_epochs=1))
    validation_batch = next(self.iterator(self.instances, num_epochs=1))

    training_loss = self.trainer._batch_loss(training_batch, for_training=True).data
    validation_loss = self.trainer._batch_loss(validation_batch, for_training=False).data

    # Training loss should have the regularization penalty, but validation loss should not.
    assert (training_loss != validation_loss).all()

    # Training loss should equal the validation loss plus the penalty.
    assert (training_loss == validation_loss + penalty).all()
def forward(self, inputs, targets):
    """Triplet loss with batch-hard mining.

    For every anchor the farthest sample with the same label and the closest
    sample with a different label are selected, and the pair of distances is
    passed to ``self.ranking_loss``.

    Args:
        - inputs: feature matrix with shape (batch_size, feat_dim)
        - targets: ground truth labels with shape (batch_size)

    Returns:
        The value of ``self.ranking_loss(dist_an, dist_ap, 1)``.
    """
    n = inputs.size(0)

    # Compute pairwise distance, replace by the official when merged
    sq_norm = torch.pow(inputs, 2).sum(dim=1, keepdim=True).expand(n, n)
    # Keyword form of addmm; the positional (beta, alpha, m1, m2) overload is deprecated.
    dist = torch.addmm(sq_norm + sq_norm.t(), inputs, inputs.t(), beta=1, alpha=-2)
    dist = dist.clamp(min=1e-12).sqrt()  # for numerical stability

    # For each anchor, find the hardest positive and negative
    mask = targets.expand(n, n).eq(targets.expand(n, n).t())
    dist_ap, dist_an = [], []
    for i in range(n):
        dist_ap.append(dist[i][mask[i]].max().unsqueeze(0))
        dist_an.append(dist[i][mask[i] == 0].min().unsqueeze(0))
    dist_ap = torch.cat(dist_ap)
    dist_an = torch.cat(dist_an)

    # Compute ranking hinge loss; y = 1 asks for dist_an to rank above dist_ap.
    y = torch.ones_like(dist_an)
    return self.ranking_loss(dist_an, dist_ap, y)
def kurtosis_score(x, dim=0):
    '''Test whether a dataset has normal kurtosis.

    This function tests the null hypothesis that the kurtosis
    of the population from which the sample was drawn is that
    of the normal distribution: ``kurtosis = 3(n-1)/(n+1)``.

    ripoff from: `scipy.stats.kurtosistest`.

    Args:
        x: Tensor of the sample data. It is first passed through
            ``_x_n_dim``; the kurtosis is then formed from raw 2nd and 4th
            moments, so that helper presumably centres the data — TODO confirm.
        dim: Dimension along which to compute the test. Default is 0.

    Returns:
        statistic: The computed z-score for this test.
        p-value: The two-sided p-value, ``1 + erf(-|Z| / sqrt(2))``.

    Raises:
        ValueError: if fewer than 20 elements are available.
    '''
    x, n, dim = _x_n_dim(x, dim)
    if n < 20:
        raise ValueError(
            "Number of elements has to be >= 20 to compute kurtosis")

    # Sample kurtosis and its mean / variance under the null hypothesis.
    fourth_moment = (x**4).mean(dim)
    second_moment = (x**2).mean(dim)
    b2 = fourth_moment / second_moment**2
    E = 3.0 * (n - 1) / (n + 1)
    varb2 = 24.0 * n * (n - 2) * (n - 3) / ((n + 1)**2 * (n + 3) * (n + 5))
    standardized = (b2 - E) / math.sqrt(varb2)

    # Map the standardised kurtosis onto an approximately normal Z.
    sqrtbeta1 = 6.0 * (n * n - 5 * n + 2) / ((n + 7) * (n + 9)) *\
        math.sqrt((6.0 * (n + 3) * (n + 5)) / (n * (n - 2) * (n - 3)))
    A = 6.0 + 8.0 / sqrtbeta1 * \
        (2.0 / sqrtbeta1 + math.sqrt(1 + 4.0 / (sqrtbeta1**2)))
    term1 = 1 - 2 / (9.0 * A)
    denom = 1 + standardized * math.sqrt(2 / (A - 4.0))
    cube_root = torch.pow((1 - 2.0 / A) / torch.abs(denom), 1 / 3.0)
    term2 = torch.sign(denom) * cube_root
    Z = (term1 - term2) / math.sqrt(2 / (9.0 * A))

    p_value = 1 + torch.erf(-math.sqrt(0.5) * torch.abs(Z))
    return Z, p_value
def guide():
    """Variational guide: a normal over ``mu_latent`` parameterised by ``pt_guide``.

    The mean is ``exp(mu_q_log)`` and the scale is ``exp(tau_q_log) ** -0.5``.
    The sample site uses a decaying-average baseline.
    """
    pyro.module("mymodule", pt_guide)
    mu_q = torch.exp(pt_guide.mu_q_log)
    tau_q = torch.exp(pt_guide.tau_q_log)
    sigma = torch.pow(tau_q, -0.5)
    q_dist = dist.Normal(mu_q, sigma, reparameterized=reparameterized)
    pyro.sample("mu_latent", q_dist,
                baseline=dict(use_decaying_avg_baseline=True))
def updateGradInput(self, input, gradOutput):
    """Gradient of the cross-channel local response normalisation w.r.t. ``input``.

    Relies on ``self.scale`` and ``self.output`` as left by the forward pass.
    CUDA float tensors are handed to the backend kernel; everything else is
    computed with tensor ops, one sample at a time.

    Args:
        input: the 4-D tensor that was given to the forward pass.
        gradOutput: gradient of the loss w.r.t. the forward output.

    Returns:
        ``self.gradInput``.

    Raises:
        AssertionError: if ``input`` is not 4-D.
    """
    assert input.dim() == 4

    if input.type() == 'torch.cuda.FloatTensor':
        self._backend.SpatialCrossMapLRN_updateGradInput(
            self._backend.library_state,
            input,
            gradOutput,
            self.gradInput,
            self.scale,
            self.output,
            self.size,
            self.alpha,
            self.beta,
            self.k
        )
    else:
        batchSize = input.size(0)
        channels = input.size(1)
        inputHeight = input.size(2)
        inputWidth = input.size(3)

        # Lazily create the scratch buffers, then size them for this input.
        if self.paddedRatio is None:
            self.paddedRatio = input.new()
        if self.accumRatio is None:
            self.accumRatio = input.new()
        self.paddedRatio.resize_(channels + self.size - 1, inputHeight, inputWidth)
        self.accumRatio.resize_(inputHeight, inputWidth)

        cacheRatioValue = 2 * self.alpha * self.beta / self.size
        inversePrePad = int(self.size - (self.size - 1) / 2)

        # Direct term: gradOutput * scale ** -beta.
        self.gradInput.resize_as_(input)
        torch.pow(self.scale, -self.beta, out=self.gradInput).mul_(gradOutput)

        self.paddedRatio.zero_()
        # NOTE(review): this slice leaves `inversePrePad` zero rows before the
        # real channels and `size - 1 - inversePrePad` after them — confirm
        # that this asymmetric padding is intended.
        paddedRatioCenter = self.paddedRatio.narrow(0, inversePrePad, channels)
        for n in range(batchSize):
            # ratio = gradOutput * output / scale, written into the centre slice.
            torch.mul(gradOutput[n], self.output[n], out=paddedRatioCenter)
            paddedRatioCenter.div_(self.scale[n])
            # Seed the running sum with the first size - 1 padded rows.
            torch.sum(self.paddedRatio.narrow(0, 0, self.size - 1), 0, keepdim=False, out=self.accumRatio)
            for c in range(channels):
                # Slide the window: add the row entering it, apply the cross
                # term to this channel, then drop the row leaving it.
                self.accumRatio.add_(self.paddedRatio[c + self.size - 1])
                self.gradInput[n][c].addcmul_(-cacheRatioValue, input[n][c], self.accumRatio)
                self.accumRatio.add_(-1, self.paddedRatio[c])

    return self.gradInput
def forward(self, inputs, targets, step, weight_constraint_lambda, logger): n = inputs.size(0) # Compute pairwise distance, replace by the official when merged # features = F.normalize(inputs) features = inputs dist = torch.pow(features, 2).sum(dim=1, keepdim=True).expand(n, n) dist = dist + dist.t() dist.addmm_(1, -2, features, features.t()) dist = dist.clamp(min=1e-12).sqrt() # for numerical stability # get the positive label mask mask = targets.expand(n, n).eq(targets.expand(n, n).t()) mask = mask.float() positive_dist = torch.mul(dist, mask) negative_dist = torch.mul(mask, dist.max()) + torch.mul(dist, 1 - mask) indexes_ap = [] indexes_ng = [] dist_ap = [] dist_an = [] for i in range(n): pos_dist, pos_index = positive_dist[i].max(0) neg_dist, neg_index = negative_dist[i].min(0) dist_ap.append(pos_dist) dist_an.append(neg_dist) indexes_ap.append(pos_index) indexes_ng.append(neg_index) dist_ap = torch.cat(dist_ap) dist_an = torch.cat(dist_an) indexes_ap = torch.cat(indexes_ap) indexes_ng = torch.cat(indexes_ng) pair_adp_inputs = [] for i in range(n): pair_adp_inputs.append(torch.cat([inputs[i, :], inputs[indexes_ap.data[i], :]])) # for i in range(n): pair_adp_inputs.append(torch.cat([inputs[i, :], inputs[indexes_ng.data[i], :]])) pair_adp_inputs = torch.stack(pair_adp_inputs) # Compute adp_pairwise distance, replace by the official when merged dist_adp = self.AdpsubM(pair_adp_inputs, n) # [2*batchsize] [ap,ng]*batchsize # dist_constraint = torch.norm(dist-dist.t()) dist_ap_adp = dist_adp[::2] dist_an_adp = dist_adp[1::2] # Compute ranking hinge loss y = dist_an.data.new() y.resize_as_(dist_an.data) y.fill_(1) y = Variable(y) # dist_neg_constr = 1/torch.norm(dist[mask==0]) trip_loss = self.softmargin_loss(dist_an - dist_ap, y) trip_loss_adp = self.softmargin_loss(dist_an_adp - dist_ap_adp, y) loss = trip_loss + trip_loss_adp # loss = trip_loss if logger: # logger.scalar_summary('Metric_constraint', Metric_constraint.data[0], step) # 
logger.scalar_summary('dist_constraint', dist_constraint.data[0], step) # logger.histo_summary('W',W.data.cpu().numpy(),step) logger.histo_summary('dist_apt', dist_adp.data.cpu().numpy(), step) logger.histo_summary('dist', dist.data.cpu().numpy(), step) logger.scalar_summary('trip_loss', trip_loss.data[0], step) prec = (dist_an.data > dist_ap.data).sum() * 1. / y.size(0) return trip_loss_adp, prec
def on_criterion(self, state):
    """Calculate the decay term and add to state['loss'].

    Each parameter in ``self.params`` contributes
    ``self.rate * sum(param ** self.p)``.

    :param state: The Model state
    :type state: dict
    """
    for param in self.params:
        penalty = self.rate * torch.pow(param, self.p).sum()
        state['loss'] += penalty
def obs_inner(i, _i, _x):
    """Observe ``_x`` at site ``obs_<i>``, surrounded by extra latent sample sites.

    ``n_superfluous_top`` non-reparameterized normal sites are sampled before
    the observation and ``n_superfluous_bottom`` after it; each has shape
    ``(4 - i, 1)``. ``_i`` is not used.
    """
    def sample_superfluous(k):
        pyro.sample("z_%d_%d" % (i, k),
                    dist.Normal(ng_zeros(4 - i, 1), ng_ones(4 - i, 1),
                                reparameterized=False))

    for k in range(n_superfluous_top):
        sample_superfluous(k)

    pyro.observe("obs_%d" % i, dist.normal, _x, mu_latent,
                 torch.pow(self.lam, -0.5))

    for k in range(n_superfluous_top,
                   n_superfluous_top + n_superfluous_bottom):
        sample_superfluous(k)
def forward(ctx, a, b):
    """Power of a tensor and a constant, with either one as the base.

    ``sort_args`` decides which argument is the tensor. When the tensor is the
    base, the tensor itself is saved for backward; when the constant is the
    base, the result is saved instead.
    """
    tensor, ctx.constant, ctx.tensor_first = sort_args(a, b)
    if not ctx.tensor_first:
        # constant ** tensor
        result = torch.pow(ctx.constant, tensor)
        ctx.save_for_backward(result)
        return result
    # tensor ** constant
    ctx.save_for_backward(tensor)
    return tensor.pow(ctx.constant)
def __compute_kl(self, mu):
    """Return the mean of ``mu ** 2`` as the encoding loss.

    Only the mean term is penalised here; no standard deviation is involved.
    """
    squared_mu = torch.pow(mu, 2)
    return torch.mean(squared_mu)
def sharpen(mask, temperature):
    """Raise ``mask`` to the power ``temperature`` and renormalise along dim 1."""
    powered = torch.pow(mask, temperature)
    totals = powered.sum(dim=1, keepdim=True)
    return powered / totals
def mapping(self, x):
    """Stack the powers ``x**0 .. x**(M-1)`` as the rows of a matrix.

    Returns a tensor with ``self.M`` rows and ``len(x)`` columns; the first
    row is all ones.
    """
    rows = [torch.ones(len(x)).view(1, -1)]
    rows.extend(torch.pow(x, degree).view(1, -1)
                for degree in range(1, self.M))
    return torch.cat(rows, 0)
def step(self, closure=None):
    """Performs a single optimization step.

    This is an Adam update with one change for parameters that are not 1-D:
    wherever the first-moment estimate has the same sign as the parameter,
    that estimate is scaled by ``1 - tanh(meta * |p|) ** 2`` before being
    applied. 1-D parameters receive the plain Adam update.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.

    Returns:
        The value returned by ``closure``, or None when no closure is given.

    Raises:
        RuntimeError: if a parameter has a sparse gradient.
    """
    loss = None
    if closure is not None:
        loss = closure()

    for group in self.param_groups:
        for p in group['params']:
            if p.grad is None:
                continue
            grad = p.grad.data
            if grad.is_sparse:
                raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
            amsgrad = group['amsgrad']

            state = self.state[p]

            # State initialization
            if len(state) == 0:
                state['step'] = 0
                # Exponential moving average of gradient values
                state['exp_avg'] = torch.zeros_like(p.data)
                # Exponential moving average of squared gradient values
                state['exp_avg_sq'] = torch.zeros_like(p.data)
                # Pick one random (row, col) index for parameters that are
                # not 1-D; it is only stored here, not read in this method.
                if len(p.size())!=1:
                    state['followed_weight'] = np.random.randint(p.size(0)),np.random.randint(p.size(1))
                if amsgrad:
                    # Maintains max of all exp. moving avg. of sq. grad. values
                    state['max_exp_avg_sq'] = torch.zeros_like(p.data)

            exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
            if amsgrad:
                max_exp_avg_sq = state['max_exp_avg_sq']
            beta1, beta2 = group['betas']

            state['step'] += 1

            # L2 weight decay is folded into the gradient (in place).
            if group['weight_decay'] != 0:
                grad.add_(group['weight_decay'], p.data)

            # Decay the first and second moment running average coefficient
            exp_avg.mul_(beta1).add_(1 - beta1, grad)
            exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
            if amsgrad:
                # Maintains the maximum of all 2nd moment running avg. till now
                torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                # Use the max. for normalizing running avg. of gradient
                denom = max_exp_avg_sq.sqrt().add_(group['eps'])
            else:
                denom = exp_avg_sq.sqrt().add_(group['eps'])

            bias_correction1 = 1 - beta1 ** state['step']
            bias_correction2 = 1 - beta2 ** state['step']
            step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

            # The update is -step_size * exp_avg / denom, so it moves the
            # weight towards zero exactly where sign(p) * exp_avg > 0.
            binary_weight_before_update = torch.sign(p.data)
            condition_consolidation = (torch.mul(binary_weight_before_update,exp_avg) > 0.0 )

            # First moment attenuated by 1 - tanh(meta * |p|)^2: the larger
            # |p|, the smaller the attenuated update.
            decayed_exp_avg = torch.mul(torch.ones_like(p.data)-torch.pow(torch.tanh(group['meta']*torch.abs(p.data)),2) ,exp_avg)

            if len(p.size())==1: # True if p is bias, false if p is weight
                p.data.addcdiv_(-step_size, exp_avg, denom)
            else:
                #p.data.addcdiv_(-step_size, exp_avg , denom) #normal update
                p.data.addcdiv_(-step_size, torch.where(condition_consolidation, decayed_exp_avg, exp_avg) , denom) #assymetric lr for metaplasticity

    return loss
def task_error(self, w, x, y):
    """Mean squared error of the linear prediction ``x @ w`` against ``y``.

    The inputs are first checked by ``self._validate_inputs``; ``y`` is
    reshaped to a column before the subtraction.
    """
    self._validate_inputs(w, x, y)

    # Compute mean squared error
    residual = torch.mm(x, w) - y.view(-1, 1)
    return torch.mean(torch.pow(residual, 2))
def forward(self, classifications, regressions, anchors, annotations):
    """Focal classification loss and smooth-L1 box regression loss.

    Args:
        classifications: per-anchor class probabilities, indexed
            ``[image, anchor, class]``.
        regressions: per-anchor box deltas, indexed ``[image, anchor, 4]``.
        anchors: anchor boxes ``(x1, y1, x2, y2)``; only ``anchors[0]`` is used.
        annotations: ground-truth rows ``(x1, y1, x2, y2, class)``, indexed
            ``[image, box, 5]``; rows whose class is -1 are treated as padding.

    Returns:
        ``(classification_loss, regression_loss)``, each a tensor of shape
        ``(1,)`` holding the mean over the images in the batch.

    All helper tensors are created on the device of ``classifications``, so
    the loss follows the inputs instead of depending on whether CUDA happens
    to be available on the machine.
    """
    alpha = 0.25
    gamma = 2.0
    device = classifications.device
    batch_size = classifications.shape[0]
    classification_losses = []
    regression_losses = []

    anchor = anchors[0, :, :]
    anchor_widths = anchor[:, 2] - anchor[:, 0]
    anchor_heights = anchor[:, 3] - anchor[:, 1]
    anchor_ctr_x = anchor[:, 0] + 0.5 * anchor_widths
    anchor_ctr_y = anchor[:, 1] + 0.5 * anchor_heights

    for j in range(batch_size):
        classification = classifications[j, :, :]
        regression = regressions[j, :, :]

        # Drop the padding rows.
        bbox_annotation = annotations[j, :, :]
        bbox_annotation = bbox_annotation[bbox_annotation[:, 4] != -1]

        if bbox_annotation.shape[0] == 0:
            # Nothing annotated in this image: both losses are zero.
            regression_losses.append(torch.tensor(0.0, device=device))
            classification_losses.append(torch.tensor(0.0, device=device))
            continue

        # Keep the probabilities away from 0 and 1 so that log() stays finite.
        classification = torch.clamp(classification, 1e-4, 1.0 - 1e-4)

        IoU = calc_iou(anchors[0, :, :], bbox_annotation[:, :4])
        IoU_max, IoU_argmax = torch.max(IoU, dim=1)

        # compute the loss for classification
        # -1 = ignore, 0 = background, 1 = the assigned class.
        targets = torch.ones(classification.shape, device=device) * -1
        targets[torch.lt(IoU_max, 0.4), :] = 0

        positive_indices = torch.ge(IoU_max, 0.5)
        num_positive_anchors = positive_indices.sum()
        assigned_annotations = bbox_annotation[IoU_argmax, :]

        targets[positive_indices, :] = 0
        targets[positive_indices,
                assigned_annotations[positive_indices, 4].long()] = 1

        alpha_factor = torch.ones(targets.shape, device=device) * alpha
        alpha_factor = torch.where(torch.eq(targets, 1.), alpha_factor,
                                   1. - alpha_factor)
        focal_weight = torch.where(torch.eq(targets, 1.),
                                   1. - classification, classification)
        focal_weight = alpha_factor * torch.pow(focal_weight, gamma)

        bce = -(targets * torch.log(classification) +
                (1.0 - targets) * torch.log(1.0 - classification))
        cls_loss = focal_weight * bce

        # Ignored anchors (target == -1) contribute nothing.
        zeros = torch.zeros(cls_loss.shape, device=device)
        cls_loss = torch.where(torch.ne(targets, -1.0), cls_loss, zeros)

        classification_losses.append(
            cls_loss.sum() / torch.clamp(num_positive_anchors.float(), min=1.0))

        if positive_indices.sum() > 0:
            assigned_annotations = assigned_annotations[positive_indices, :]

            anchor_widths_pi = anchor_widths[positive_indices]
            anchor_heights_pi = anchor_heights[positive_indices]
            anchor_ctr_x_pi = anchor_ctr_x[positive_indices]
            anchor_ctr_y_pi = anchor_ctr_y[positive_indices]

            gt_widths = assigned_annotations[:, 2] - assigned_annotations[:, 0]
            gt_heights = assigned_annotations[:, 3] - assigned_annotations[:, 1]
            gt_ctr_x = assigned_annotations[:, 0] + 0.5 * gt_widths
            gt_ctr_y = assigned_annotations[:, 1] + 0.5 * gt_heights

            # Clamp after the centres are computed so that log() below never
            # sees a width or height below 1.
            gt_widths = torch.clamp(gt_widths, min=1)
            gt_heights = torch.clamp(gt_heights, min=1)

            targets_dx = (gt_ctr_x - anchor_ctr_x_pi) / anchor_widths_pi
            targets_dy = (gt_ctr_y - anchor_ctr_y_pi) / anchor_heights_pi
            targets_dw = torch.log(gt_widths / anchor_widths_pi)
            targets_dh = torch.log(gt_heights / anchor_heights_pi)

            targets = torch.stack(
                (targets_dx, targets_dy, targets_dw, targets_dh))
            targets = targets.t()

            norm = torch.tensor([[0.1, 0.1, 0.2, 0.2]], device=device)
            targets = targets / norm

            regression_diff = torch.abs(
                targets - regression[positive_indices, :])

            # Smooth L1: quadratic below 1/9, linear above.
            regression_loss = torch.where(
                torch.le(regression_diff, 1.0 / 9.0),
                0.5 * 9.0 * torch.pow(regression_diff, 2),
                regression_diff - 0.5 / 9.0)
            regression_losses.append(regression_loss.mean())
        else:
            regression_losses.append(torch.tensor(0.0, device=device))

    return torch.stack(classification_losses).mean(
        dim=0, keepdim=True), torch.stack(regression_losses).mean(dim=0, keepdim=True)
) if cfg.METHOD == "tau_norm": model_state_dict = model.state_dict() # set bias as zero model_state_dict['module.classifier.bias'].copy_(torch.zeros( (num_classes))) weight_ori = model_state_dict['module.classifier.weight'] norm_weight = torch.norm(weight_ori, 2, 1) best_accuracy = 0 best_p = 0 for p in np.arange(0.0, 1.0, 0.1): ws = weight_ori.clone() for i in range(weight_ori.size(0)): ws[i] = ws[i] / torch.pow(norm_weight[i], p) model_state_dict['module.classifier.weight'].copy_(ws) print("\n___________________________", p, "__________________________________") acc, _ = valid_model(testLoader, model, num_classes, para_dict_train, para_dict_test,criterion, LOSS_RATIO=0) if acc > best_accuracy: best_accuracy = acc best_p = p print("when p is", best_p, ", best result is", best_accuracy) elif cfg.METHOD == "BPM": best_accuracy = 0 best_LOSS_RATIO = 0 for LOSS_RATIO in np.arange(0.0, 2.0, 0.1): print("\n___________________________", LOSS_RATIO, "__________________________________") acc, acc_per_class = valid_model(testLoader, model, num_classes, para_dict_train, para_dict_test, criterion, LOSS_RATIO)
def l2norm(X, dim, eps=1e-8):
    """L2-normalize ``X`` along dimension ``dim``.

    ``eps`` is added to the norm before dividing, so an all-zero slice comes
    out as zeros.
    """
    squared = torch.pow(X, 2)
    norm = squared.sum(dim=dim, keepdim=True).sqrt() + eps
    return torch.div(X, norm)
def psi(a: torch.Tensor) -> torch.Tensor:
    """Quadratic penalty function: ``max(0, a) ** 2`` element-wise."""
    floor = torch.zeros_like(a)
    positive_part = torch.max(floor, a)
    return torch.pow(positive_part, 2)
def load_data(dataset_name,
              splits_file_path=None,
              train_percentage=None,
              val_percentage=None,
              embedding_mode=None,
              embedding_method=None,
              embedding_method_graph=None,
              embedding_method_space=None):
    """Load a graph dataset and build the DGL graph, tensors and split masks.

    For 'cora', 'citeseer' and 'pubmed' the adjacency, features and labels
    come from ``utils.load_data``. For any other name they are parsed from
    the tab-separated files ``new_data/<dataset_name>/out1_graph_edges.txt``
    and ``new_data/<dataset_name>/out1_node_feature_label.txt``.

    Args:
        dataset_name: name of the dataset (selects the loading path above).
        splits_file_path: optional path passed to ``np.load``; when given,
            'train_mask', 'val_mask' and 'test_mask' are read from it.
        train_percentage: fraction used for random splitting; required
            (together with ``val_percentage``) when no splits file is given.
        val_percentage: see ``train_percentage``.
        embedding_mode: when falsy, the graph is the adjacency plus an
            identity matrix. Otherwise edges are read from an embedding
            relation file and tagged with a 'subgraph_idx'.
        embedding_method: used in the relation file name for the default
            (non 'ExperimentTwo*') embedding mode.
        embedding_method_graph: used in the relation file name for
            'ExperimentTwoPairs'.
        embedding_method_space: used in the relation file name for
            'ExperimentTwoPairs'.

    Returns:
        Tuple ``(g, features, labels, train_mask, val_mask, test_mask,
        num_features, num_labels)`` where ``g`` is a ``DGLGraph`` with a
        'norm' node feature, the next five entries are torch tensors and the
        last two are Python ints.

    NOTE(review): the normalisation tensor is moved with ``.cuda()``
    unconditionally, so this function needs a CUDA device as written.
    """
    if dataset_name in {'cora', 'citeseer', 'pubmed'}:
        adj, features, labels, _, _, _ = utils.load_data(dataset_name)
        # Labels arrive one-hot encoded; collapse them to class indices.
        labels = np.argmax(labels, axis=-1)
        features = features.todense()
        G = nx.DiGraph(adj)
    else:
        graph_adjacency_list_file_path = os.path.join('new_data', dataset_name,
                                                      'out1_graph_edges.txt')
        graph_node_features_and_labels_file_path = os.path.join(
            'new_data', dataset_name, f'out1_node_feature_label.txt')

        G = nx.DiGraph()
        graph_node_features_dict = {}
        graph_labels_dict = {}

        if dataset_name == 'film':
            # 'film' stores features as a list of active indices, which is
            # expanded here into a 932-dimensional 0/1 vector.
            with open(graph_node_features_and_labels_file_path
                      ) as graph_node_features_and_labels_file:
                # Skip the first (header) line.
                graph_node_features_and_labels_file.readline()
                for line in graph_node_features_and_labels_file:
                    line = line.rstrip().split('\t')
                    assert (len(line) == 3)
                    # Each node id may appear only once in the file.
                    assert (int(line[0]) not in graph_node_features_dict
                            and int(line[0]) not in graph_labels_dict)
                    feature_blank = np.zeros(932, dtype=np.uint8)
                    feature_blank[np.array(line[1].split(','),
                                           dtype=np.uint16)] = 1
                    graph_node_features_dict[int(line[0])] = feature_blank
                    graph_labels_dict[int(line[0])] = int(line[2])
        else:
            # Other datasets store the dense feature vector directly as a
            # comma-separated list.
            with open(graph_node_features_and_labels_file_path
                      ) as graph_node_features_and_labels_file:
                # Skip the first (header) line.
                graph_node_features_and_labels_file.readline()
                for line in graph_node_features_and_labels_file:
                    line = line.rstrip().split('\t')
                    assert (len(line) == 3)
                    assert (int(line[0]) not in graph_node_features_dict
                            and int(line[0]) not in graph_labels_dict)
                    graph_node_features_dict[int(line[0])] = np.array(
                        line[1].split(','), dtype=np.uint8)
                    graph_labels_dict[int(line[0])] = int(line[2])

        with open(graph_adjacency_list_file_path) as graph_adjacency_list_file:
            # Skip the first (header) line.
            graph_adjacency_list_file.readline()
            for line in graph_adjacency_list_file:
                line = line.rstrip().split('\t')
                assert (len(line) == 2)
                # Nodes are added lazily, the first time an edge mentions
                # them, carrying their features and label as attributes.
                if int(line[0]) not in G:
                    G.add_node(int(line[0]),
                               features=graph_node_features_dict[int(line[0])],
                               label=graph_labels_dict[int(line[0])])
                if int(line[1]) not in G:
                    G.add_node(int(line[1]),
                               features=graph_node_features_dict[int(line[1])],
                               label=graph_labels_dict[int(line[1])])
                G.add_edge(int(line[0]), int(line[1]))

        # Everything below is ordered by sorted node id so that rows of adj,
        # features and labels line up.
        adj = nx.adjacency_matrix(G, sorted(G.nodes()))
        features = np.array([
            features for _, features in sorted(G.nodes(data='features'),
                                               key=lambda x: x[0])
        ])
        labels = np.array([
            label
            for _, label in sorted(G.nodes(data='label'), key=lambda x: x[0])
        ])

    features = utils.preprocess_features(features)

    if not embedding_mode:
        # Plain graph: original adjacency plus self loops.
        g = DGLGraph(adj + sp.eye(adj.shape[0]))
    else:
        # Pick the relation file matching the requested embedding mode.
        if embedding_mode == 'ExperimentTwoAll':
            embedding_file_path = os.path.join(
                'embedding_method_combinations_all',
                f'outf_nodes_relation_{dataset_name}all_embedding_methods.txt')
        elif embedding_mode == 'ExperimentTwoPairs':
            embedding_file_path = os.path.join(
                'embedding_method_combinations_in_pairs',
                f'outf_nodes_relation_{dataset_name}_graph_{embedding_method_graph}_space_{embedding_method_space}.txt'
            )
        else:
            embedding_file_path = os.path.join(
                'structural_neighborhood',
                f'outf_nodes_space_relation_{dataset_name}_{embedding_method}.txt'
            )
        # Maps each (space, relation_type) pair to a consecutive integer id.
        space_and_relation_type_to_idx_dict = {}

        with open(embedding_file_path) as embedding_file:
            for line in embedding_file:
                # Skip the header row wherever it appears.
                if line.rstrip() == 'node1,node2 space relation_type':
                    continue
                line = re.split(r'[\t,]', line.rstrip())
                assert (len(line) == 4)
                assert (int(line[0]) in G and int(line[1]) in G)
                if (line[2], int(
                        line[3])) not in space_and_relation_type_to_idx_dict:
                    space_and_relation_type_to_idx_dict[(line[2], int(
                        line[3]))] = len(space_and_relation_type_to_idx_dict)
                # Replace any existing edge so the stored subgraph_idx is the
                # one from the most recent line for this node pair.
                if G.has_edge(int(line[0]), int(line[1])):
                    G.remove_edge(int(line[0]), int(line[1]))
                G.add_edge(int(line[0]),
                           int(line[1]),
                           subgraph_idx=space_and_relation_type_to_idx_dict[(
                               line[2], int(line[3]))])

        # Self loops get their own relation id, allocated after all others.
        space_and_relation_type_to_idx_dict['self_loop'] = len(
            space_and_relation_type_to_idx_dict)
        for node in sorted(G.nodes()):
            if G.has_edge(node, node):
                G.remove_edge(node, node)
            G.add_edge(
                node,
                node,
                subgraph_idx=space_and_relation_type_to_idx_dict['self_loop'])
        adj = nx.adjacency_matrix(G, sorted(G.nodes()))
        g = DGLGraph(adj)

        # Copy the per-edge relation id from the networkx graph onto the
        # matching DGL edge.
        for u, v, feature in G.edges(data='subgraph_idx'):
            g.edges[g.edge_id(u, v)].data['subgraph_idx'] = th.tensor([feature])

    if splits_file_path:
        # Pre-computed splits take precedence over random splitting.
        with np.load(splits_file_path) as splits_file:
            train_mask = splits_file['train_mask']
            val_mask = splits_file['val_mask']
            test_mask = splits_file['test_mask']
    else:
        assert (train_percentage is not None and val_percentage is not None)
        assert (train_percentage < 1.0 and val_percentage < 1.0
                and train_percentage + val_percentage < 1.0)

        if dataset_name in {'cora', 'citeseer'}:
            # For these datasets the nodes listed in the "unconnected nodes"
            # file are excluded from all three splits.
            disconnected_node_file_path = os.path.join(
                'unconnected_nodes', f'{dataset_name}_unconnected_nodes.txt')
            with open(disconnected_node_file_path) as disconnected_node_file:
                # Skip the first (header) line.
                disconnected_node_file.readline()
                disconnected_nodes = []
                for line in disconnected_node_file:
                    line = line.rstrip()
                    disconnected_nodes.append(int(line))

            disconnected_nodes = np.array(disconnected_nodes)
            connected_nodes = np.setdiff1d(np.arange(features.shape[0]),
                                           disconnected_nodes)

            connected_labels = labels[connected_nodes]

            # First carve off the test set, then split the remainder into
            # train and validation.
            # NOTE(review): the second ShuffleSplit is applied to the
            # train+val subset only, so train_size=train_percentage is a
            # fraction of that subset rather than of all nodes - confirm
            # this is intended.
            train_and_val_index, test_index = next(
                ShuffleSplit(n_splits=1,
                             train_size=train_percentage +
                             val_percentage).split(
                                 np.empty_like(connected_labels),
                                 connected_labels))
            train_index, val_index = next(
                ShuffleSplit(n_splits=1, train_size=train_percentage).split(
                    np.empty_like(connected_labels[train_and_val_index]),
                    connected_labels[train_and_val_index]))
            # Indices returned by the second split are relative to the
            # train+val subset; map them back.
            train_index = train_and_val_index[train_index]
            val_index = train_and_val_index[val_index]

            # Masks are indexed by original node id, hence the extra hop
            # through connected_nodes.
            train_mask = np.zeros_like(labels)
            train_mask[connected_nodes[train_index]] = 1
            val_mask = np.zeros_like(labels)
            val_mask[connected_nodes[val_index]] = 1
            test_mask = np.zeros_like(labels)
            test_mask[connected_nodes[test_index]] = 1
        else:
            # Same two-stage split as above, but over every node.
            train_and_val_index, test_index = next(
                ShuffleSplit(n_splits=1,
                             train_size=train_percentage +
                             val_percentage).split(np.empty_like(labels),
                                                   labels))
            train_index, val_index = next(
                ShuffleSplit(n_splits=1, train_size=train_percentage).split(
                    np.empty_like(labels[train_and_val_index]),
                    labels[train_and_val_index]))
            train_index = train_and_val_index[train_index]
            val_index = train_and_val_index[val_index]

            train_mask = np.zeros_like(labels)
            train_mask[train_index] = 1
            val_mask = np.zeros_like(labels)
            val_mask[val_index] = 1
            test_mask = np.zeros_like(labels)
            test_mask[test_index] = 1

    num_features = features.shape[1]
    num_labels = len(np.unique(labels))
    # Labels must be exactly 0..num_labels-1 with no gaps.
    assert (np.array_equal(np.unique(labels),
                           np.arange(len(np.unique(labels)))))

    features = th.FloatTensor(features)
    labels = th.LongTensor(labels)
    train_mask = th.BoolTensor(train_mask)
    val_mask = th.BoolTensor(val_mask)
    test_mask = th.BoolTensor(test_mask)

    # Adapted from https://docs.dgl.ai/tutorials/models/1_gnn/1_gcn.html
    # Per-node normaliser deg^-0.5; zero-degree nodes produce inf, which is
    # reset to 0.
    degs = g.in_degrees().float()
    norm = th.pow(degs, -0.5).cuda()
    norm[th.isinf(norm)] = 0
    g.ndata['norm'] = norm.unsqueeze(1)

    return g, features, labels, train_mask, val_mask, test_mask, num_features, num_labels
def angle_defn(pos, i, d_model_size):
    """Return ``pos`` scaled by the positional-encoding angle rates.

    The rate for index ``i`` is ``1 / 10000 ** (2 * (i // 2) / d_model_size)``,
    so each consecutive pair of indices shares one rate.

    Args:
        pos: tensor (or scalar) multiplied by the rates.
        i: tensor of indices; used as the exponent of ``torch.pow``.
        d_model_size: divisor applied to the doubled, floored index.
    """
    pair_index = i // 2
    exponent = (2 * pair_index) / d_model_size
    angle_rates = 1 / torch.pow(10000, exponent)
    return pos * angle_rates
import math
import numpy as np
import scipy as sp
import scipy.linalg
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
from nf.utils import unconstrained_RQS

# Supported non-linearities and their element-wise derivatives; note that the
# function must be invertible.
#
# The indicator masks are cast with ``type_as(x)`` so the derivative stays on
# the same device and dtype as the input (a hard cast to torch.FloatTensor
# would always land on the CPU).
functional_derivatives = {
    torch.tanh: lambda x: 1 - torch.pow(torch.tanh(x), 2),
    # F.leaky_relu uses negative_slope=0.01 by default, so the slope on the
    # negative side is +0.01.
    F.leaky_relu: lambda x: (x > 0).type_as(x) + \
                            (x < 0).type_as(x) * 0.01,
    # d/dx elu(x) = exp(x) for x < 0 (alpha = 1) and 1 for x > 0.
    F.elu: lambda x: (x > 0).type_as(x) + \
                     (x < 0).type_as(x) * torch.exp(x)
}


class Planar(nn.Module):
    """
    Planar flow.

        z = f(x) = x + u h(wᵀx + b)

    [Rezende and Mohamed, 2015]
    """
    def __init__(self, dim, nonlinearity=torch.tanh):
        super().__init__()
        # Non-linearity h; its derivative is looked up in
        # functional_derivatives.
        self.h = nonlinearity
def tts_train_loop(paths: Paths, model: Tacotron, optimizer, train_set, lr,
                   train_steps, attn_example, max_y, max_x):
    """Train ``model`` on ``train_set`` with L1 mel losses plus a guided-attention loss.

    The number of epochs is derived from ``train_steps`` and the number of
    batches in ``train_set``. For every batch the model's attention is
    compared (squared error) with attention guides loaded from
    ``hp.attention_path/<item_id>.npy`` and the result is added to the two
    L1 losses on the model's spectrogram outputs.

    Args:
        paths: project paths object; ``tts_attention``, ``tts_mel_plot`` and
            ``tts_log`` are read from it.
        model: the Tacotron model being trained.
        optimizer: optimizer whose param-group learning rates are overwritten
            with ``lr``.
        train_set: iterable of batches ``(x, m, ids, _, padded_att_guides)``.
        lr: learning rate applied to every param group.
        train_steps: step budget used to compute the epoch count.
        attn_example: item id; when it occurs in a batch, its attention and
            spectrogram plots are saved.
        max_y: not referenced in this function body.
        max_x: not referenced in this function body.

    NOTE(review): the many ``print`` calls dump full tensors on every batch
    and look like leftover debugging output.
    """
    device = next(
        model.parameters()).device  # use same device as model parameters

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = train_steps // total_iters + 1

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss = 0

        # Perform 1 epoch
        # NOTE(review): padded_att_guides from the batch is never used; the
        # guides are re-loaded from disk below.
        for i, (x, m, ids, _, padded_att_guides) in enumerate(train_set, 1):

            x, m = x.to(device), m.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                m1_hat, m2_hat, attention, r = data_parallel_workaround(
                    model, x, m)
            else:
                m1_hat, m2_hat, attention, r = model(x, m)

            reduced_guides = []

            att_guide_path = hp.attention_path
            for j, item_id in enumerate(ids):
                att = np.load(f'{att_guide_path}/{item_id}.npy')
                # Keep every r-th row of the guide.
                # presumably r is the model's reduction factor - TODO confirm
                reduced = att[0::r]
                pred_attention = attention[j]
                n_frames = pred_attention.shape[0]
                n_phones = pred_attention.shape[-1]
                # pred_attention = torch.tensor(pred_attention)
                # reduced = torch.tensor(reduced)
                # Pad the guide to the predicted attention's size.
                # presumably padding positions are filled with -1 (see the
                # mask below) - TODO confirm against pad2d_nonzero
                padded_guides = pad2d_nonzero(reduced, n_frames, n_phones)
                #padded_guides = torch.tensor(padded_guides)
                reduced_guides.append(padded_guides)
            reduced_guides = torch.tensor(reduced_guides)
            # 1.0 wherever the guide entry differs from -1, else 0.0.
            mask = torch.ne(reduced_guides, -1).type(torch.FloatTensor)
            # NOTE(review): mask is already a tensor; re-wrapping it with
            # torch.tensor only makes a copy.
            mask = torch.tensor(mask)
            # n_frames / n_phones here are the values left over from the last
            # iteration of the loop above.
            padded_guides = [
                pad2d_zero(x, n_frames, n_phones) for x in reduced_guides
            ]
            padded_guides = torch.tensor(padded_guides)
            padded_guides = padded_guides.to(device)
            attention = attention.to(device)
            mask = mask.to(device)
            # Zero the predicted attention wherever the mask is 0.
            attention = attention * mask
            print("guide att shape", att.shape)
            print(att)
            print("reduced guide", padded_guides.shape)
            # print("attention size",n_frames, n_phones)
            print("mask", mask.shape)
            print(mask)
            print(padded_guides.shape, attention.shape, mask.shape)

            print(attention)
            print(padded_guides)
            # Element-wise squared error between attention and guides.
            multiply = torch.pow((attention - padded_guides), 2)
            print(multiply)
            #multiply = torch.pow((pred_attention - padded_guides),2)* mask
            #print(multiply)

            attention_loss = torch.sum(multiply)
            print(attention_loss)
            # Normalise by the number of unmasked entries.
            mask_sum1 = torch.sum(mask)

            attention_loss /= mask_sum1
            print(attention_loss)

            # batch_attention_losses.append(attention_loss)
            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)
            #average_att_loss = sum(batch_attention_losses)/len(batch_attention_losses)
            #print("attention loss", average_att_loss)
            #print("m losses", m1_loss, m2_loss)
            prev_loss = m1_loss + m2_loss
            print("prev loss", prev_loss)
            loss = m1_loss + m2_loss + attention_loss
            print("loss + att", loss)
            #exit()
            optimizer.zero_grad()
            loss.backward()
            if hp.tts_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.tts_clip_grad_norm)
                # NOTE(review): np.isnan on the value returned by
                # clip_grad_norm_ may fail if that is a CUDA tensor - confirm
                # for the PyTorch version in use.
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')

            optimizer.step()

            running_loss += loss.item()
            # i starts at 1, so this is the mean loss over batches so far.
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.tts_checkpoint_every == 0:
                ckpt_name = f'taco_step{k}K'
                save_checkpoint('tts',
                                paths,
                                model,
                                optimizer,
                                name=ckpt_name,
                                is_silent=True)

            if attn_example in ids:
                idx = ids.index(attn_example)
                save_attention(np_now(attention[idx][:, :160]),
                               paths.tts_attention / f'{step}')
                save_spectrogram(np_now(m2_hat[idx]),
                                 paths.tts_mel_plot / f'{step}', 600)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:#.4} | {speed:#.2} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('tts', paths, model, optimizer, is_silent=True)
        model.log(paths.tts_log, msg)
        print(' ')
def train(self,
          epoch,
          max_epoch,
          writer,
          print_freq=10,
          fixbase_epoch=0,
          open_layers=None):
    """Run one training epoch over ``self.train_loader``.

    The total loss combines the weighted ``criterion_t`` loss on the
    features, the ``criterion_x`` loss summed over five classifier outputs,
    a "greedy" hashing loss and a batch-hard hashing loss.

    Args:
        epoch: zero-based index of the current epoch.
        max_epoch: total number of epochs (used for the ETA estimate only).
        writer: optional summary writer; scalars are logged when not None.
        print_freq: print a progress line every ``print_freq`` batches.
        fixbase_epoch: while ``epoch + 1 <= fixbase_epoch`` only
            ``open_layers`` are opened for training.
        open_layers: layers to open during the fixed-base phase; when None
            all layers are opened.
    """
    losses_t = AverageMeter()
    losses_x = AverageMeter()
    accs = AverageMeter()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    # NOTE(review): loss_meter is created but never updated or read below.
    loss_meter = AverageMeter()

    self.model.train()
    if (epoch + 1) <= fixbase_epoch and open_layers is not None:
        print('* Only train {} (epoch: {}/{})'.format(
            open_layers, epoch + 1, fixbase_epoch))
        open_specified_layers(self.model, open_layers)
    else:
        open_all_layers(self.model)

    num_batches = len(self.train_loader)
    end = time.time()
    # NOTE(review): layer_nums is assigned but never used below.
    layer_nums = 3
    for batch_idx, data in enumerate(self.train_loader):
        # Time spent waiting for the data loader.
        data_time.update(time.time() - end)

        imgs, pids = self._parse_data_for_train(data)
        if self.use_gpu:
            imgs = imgs.cuda()
            pids = pids.cuda()

        self.optimizer.zero_grad()
        outputs, features, h, b, y_resnet, mgn_1, mgn_2, mgn_3 = self.model(
            imgs)
        #print(len(logits_list))
        #print(logits_list[0].shape)
        pids_g = self.parse_pids(pids)

        x = features
        # Cosine similarity between the first and second half of the batch,
        # computed once on b and once on the features; loss1 pulls the two
        # similarity vectors together.
        target_b = F.cosine_similarity(b[:pids_g.size(0) // 2],
                                       b[pids_g.size(0) // 2:])
        target_x = F.cosine_similarity(x[:pids_g.size(0) // 2],
                                       x[pids_g.size(0) // 2:])
        loss1 = F.mse_loss(target_b, target_x)
        # Mean of | |h| - 1 |^3: pushes the magnitude of h towards 1.
        # NOTE(review): the ones tensor is moved with .cuda() regardless of
        # self.use_gpu.
        loss2 = torch.mean(
            torch.abs(
                torch.pow(
                    torch.abs(h) - Variable(torch.ones(h.size()).cuda()), 3)))
        loss_greedy = loss1 + 0.1 * loss2
        loss_batchhard_hash = self.compute_hashbatchhard(b, pids)
        loss_t = self._compute_loss(self.criterion_t, features, pids)
        # criterion_x is applied to each of the five outputs and summed.
        loss_x = self._compute_loss(
            self.criterion_x, outputs, pids) + self._compute_loss(
                self.criterion_x, y_resnet, pids) + self._compute_loss(
                    self.criterion_x, mgn_1, pids) + self._compute_loss(
                        self.criterion_x, mgn_2, pids) + self._compute_loss(
                            self.criterion_x, mgn_3, pids)
        loss = self.weight_t * loss_t + self.weight_x * loss_x + loss_greedy + loss_batchhard_hash * 2
        loss.backward()
        self.optimizer.step()

        batch_time.update(time.time() - end)

        losses_t.update(loss_t.item(), pids.size(0))
        losses_x.update(loss_x.item(), pids.size(0))
        accs.update(metrics.accuracy(outputs, pids)[0].item())

        if (batch_idx + 1) % print_freq == 0:
            # estimate remaining time
            eta_seconds = batch_time.avg * (num_batches - (batch_idx + 1) +
                                            (max_epoch -
                                             (epoch + 1)) * num_batches)
            eta_str = str(datetime.timedelta(seconds=int(eta_seconds)))
            # loss_g / loss_p are the current batch's values, not running
            # averages.
            print('Epoch: [{0}/{1}][{2}/{3}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss_t {loss_t.val:.4f} ({loss_t.avg:.4f})\t'
                  'Loss_x {loss_x.val:.4f} ({loss_x.avg:.4f})\t'
                  'Loss_g {loss_g:.4f} )\t'
                  'Loss_p {loss_p:.4f} )\t'
                  'Acc {acc.val:.2f} ({acc.avg:.2f})\t'
                  'Lr {lr:.6f}\t'
                  'eta {eta}'.format(
                      epoch + 1,
                      max_epoch,
                      batch_idx + 1,
                      num_batches,
                      batch_time=batch_time,
                      data_time=data_time,
                      loss_t=losses_t,
                      loss_x=losses_x,
                      loss_g=loss_greedy,
                      loss_p=loss_batchhard_hash,
                      acc=accs,
                      lr=self.optimizer.param_groups[0]['lr'],
                      eta=eta_str))

        if writer is not None:
            # Global iteration counter across epochs.
            n_iter = epoch * num_batches + batch_idx
            writer.add_scalar('Train/Time', batch_time.avg, n_iter)
            writer.add_scalar('Train/Data', data_time.avg, n_iter)
            writer.add_scalar('Train/Loss_t', losses_t.avg, n_iter)
            writer.add_scalar('Train/Loss_x', losses_x.avg, n_iter)
            writer.add_scalar('Train/Acc', accs.avg, n_iter)
            writer.add_scalar('Train/Lr',
                              self.optimizer.param_groups[0]['lr'], n_iter)

        end = time.time()

    # The learning-rate scheduler advances once per epoch.
    if self.scheduler is not None:
        self.scheduler.step()
def expected_val(pred):
    """Return the expected value of the values 1..5 under softmax(pred).

    ``pred`` must be 3-D with a last dimension of 5 (it is multiplied by a
    (5, 1) column holding 1..5). The result has shape (n, m, 1).
    """
    n, m, d = pred.size()
    # Column vector [1, 2, 3, 4, 5]^T.
    p = Variable(torch.arange(1, 6)).view((5, 1))
    if CUDA:
        p = p.cuda()
    # Flatten to (n*m, d), take the probability-weighted sum, restore shape.
    return torch.mm(softmax(pred).view((n * m, d)), p).view((n, m, 1))


# Training script. NOTE(review): this is Python 2 / old-PyTorch code
# (xrange, Variable, .data[0]); enc, dec, ce, mse, pars, optimizer and the
# train_*/val_* tensors are module globals defined elsewhere.
epochs = 1000
for ep in xrange(epochs):
    optimizer.zero_grad()
    #print(train_id.size(), train_mask.size())
    embeddings = enc(train_id, train_mask)
    y_hat = dec(embeddings, train_mask)
    train_loss = ce(y_hat, train_id, train_mask)
    # L2 regularisation: sum of squared entries over every tensor in pars.
    reg_loss = 0
    for p in pars:
        reg_loss += torch.sum(torch.pow(p, 2))
    loss = train_loss + 0.0001 * reg_loss
    loss.backward()
    # Monitoring metric only; it is computed after backward() and is not
    # part of the optimised loss.
    mse_train = mse(expected_val(y_hat), train_x, train_mask)
    optimizer.step()
    # ep % 1 is always 0, so validation runs every epoch.
    if ep % 1 == 0:
        val_hat = dec(enc(train_id, train_mask), val_mask)
        mse_val = mse(expected_val(val_hat), val_x, val_mask)
        val_loss = np.sqrt(mse_val.data[0])
        print(
            'Train Epoch: {}, Loss: {:.6f}, MSE: {:.6f}, Val_loss: {:.6f}'.format(
                ep, loss.data[0], np.sqrt(mse_train.data[0]), val_loss))
import torch

if __name__ == "__main__":
    # Two random (3, 2) point sets on the GPU, both tracking gradients.
    x = torch.randn(3, 2).cuda().requires_grad_()
    y = torch.randn(3, 2).cuda().requires_grad_()
    k = 3

    # Broadcast to a (3, 3, 2) grid of differences, reduce the last axis to
    # Euclidean distances, then back-propagate the sum of all distances.
    pairwise_diff = x.unsqueeze(0) - y.unsqueeze(1)
    squared_dist = torch.pow(pairwise_diff, 2).sum(dim=2)
    total_dist = torch.sqrt(squared_dist).sum()
    total_dist.backward()

    print(x.grad, y.grad)
def forward(self, x1, x2):
    """Return the row-wise ``self.norm``-norm distance between x1 and x2.

    Both inputs must have identical sizes. The powered absolute differences
    are summed over dimension 1, a small epsilon (``1e-4 / x1.size(1)``) is
    added, and the ``1 / self.norm`` root is taken.
    """
    assert x1.size() == x2.size()
    degree = self.norm
    eps = 1e-4 / x1.size(1)
    abs_diff = (x1 - x2).abs()
    powered_sum = abs_diff.pow(degree).sum(dim=1)
    return (powered_sum + eps).pow(1. / degree)
def forward(self, x, U, V, N, eta):
    """Run one update step of a per-channel mixture with K components.

    x: B x C x H x W
    U: B x (C x K) x H x W
    V: B x (C x K) x H x W
    N: B x (C x K) x H x W

    Channels ``i*K .. (i+1)*K - 1`` of U, V and N hold the K mixture
    components that belong to channel ``i`` of ``x``.

    Args:
        x: current input.
        U: running first moment of every component.
        V: running second moment of every component (variance is taken as
            ``V - U**2``, clamped from below at 0.01).
        N: per-component weights; they are renormalised so that each group
            of K sums to ``1 / eta - 1``.
        eta: scalar controlling that total.

    Returns:
        Tuple ``(U, V, N, prob)`` with the updated moments and weights
        (B x CK x H x W each) and ``prob`` (B x C x H x W), the
        weight-averaged standard-normal CDF of ``|x - U| / S`` per channel.

    All per-channel groups are processed at once by viewing the tensors as
    B x C x K x H x W instead of looping over channels in Python.
    """
    B, C, H, W = x.shape
    B, CK, H, W = U.shape
    K = CK // C

    def grouped(t):
        # View B x CK x H x W as B x C x K x H x W (K components per channel).
        return t.reshape(B, C, K, H, W)

    S2 = torch.clamp(V - torch.pow(U, 2), min=0.01)
    S = torch.sqrt(S2)

    # X_cat: B x CK x H x W, every input channel repeated K times in a row so
    # that it lines up with its K mixture components.
    X_cat = torch.repeat_interleave(x, K, dim=1)

    XdU = X_cat - U  # B x CK x H x W
    XdUoS = XdU / S  # B x CK x H x W
    XdUoS2 = torch.pow(XdUoS, 2)  # B x CK x H x W

    nTotal = 1 / eta - 1  # scalar

    # Renormalise the weights so each group of K sums to nTotal.
    N_grouped = grouped(N)
    N = (nTotal * N_grouped /
         torch.sum(N_grouped, dim=2, keepdim=True)).reshape(B, CK, H, W)
    assert N.shape == (B, CK, H, W)

    P = N / nTotal  # P: B x CK x H x W, sums to 1 within each group
    assert P.shape == (B, CK, H, W)

    # cdf: B x CK x H x W (the CDF is element-wise, no per-channel loop needed)
    cdf = Normal(0, 1).cdf(torch.abs(XdUoS))
    assert cdf.shape == (B, CK, H, W)

    # prob: B x C x H x W, mixture-weighted CDF per input channel
    prob = torch.sum(grouped(P * cdf), dim=2)
    assert prob.shape == (B, C, H, W)

    log_prob = torch.log(N) + -0.5 * XdUoS2 - torch.log(S)

    # Responsibilities: softmax over the K components of each channel.
    Gamma = torch.softmax(grouped(log_prob), dim=2).reshape(B, CK, H, W)

    N = N + Gamma
    Eta = Gamma / N
    U = U + Eta * (X_cat - U)
    V = V + Eta * (torch.pow(X_cat, 2) - V)
    return U, V, N, prob
def gelu(x):
    """Tanh approximation of the GELU activation.

    Computes ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    element-wise.
    """
    cubic_term = x + 0.044715 * torch.pow(x, 3)
    inner = math.sqrt(2 / math.pi) * cubic_term
    return 0.5 * x * (1 + torch.tanh(inner))
def forward(self, x):
    """Return the mean squared deviation of the pooled input from ``self.mean_val``.

    ``x`` is first averaged over dimension 1 (kept as a singleton), passed
    through ``self.pool``, and compared with ``self.mean_val``.

    The target is created on the same device as the pooled tensor, so the
    loss works on CPU and on any GPU instead of requiring the default CUDA
    device.

    Args:
        x: input tensor with at least two dimensions.

    Returns:
        Scalar tensor ``mean((pool(mean(x, 1)) - mean_val) ** 2)``.
    """
    x = torch.mean(x, 1, keepdim=True)
    mean = self.pool(x)
    # float32 one-element tensor, matching the previous torch.FloatTensor
    # target but placed next to `mean`.
    target = torch.tensor([self.mean_val],
                          dtype=torch.float32,
                          device=mean.device)
    return torch.mean(torch.pow(mean - target, 2))
def kl_loss(y: torch.Tensor) -> torch.Tensor:
    """Return the mean of the squared entries of ``y`` as a scalar tensor."""
    return y.pow(2).mean()
def mse(output, target):
    """Return the mean squared error between ``output`` and ``target``."""
    residual = output - target
    return residual.pow(2).mean()
def variance(self):
    """Return ``_moments(concentration1, concentration0, 2) - mean ** 2``.

    presumably ``_moments(..., 2)`` is the second raw moment, which makes
    this E[X^2] - E[X]^2 - TODO confirm against ``_moments``.
    """
    second_moment = _moments(self.concentration1, self.concentration0, 2)
    squared_mean = torch.pow(self.mean, 2)
    return second_moment - squared_mean
def power(self, tensor_in_1, tensor_in_2):
    """Raise ``tensor_in_1`` to the power ``tensor_in_2`` element-wise.

    Both arguments are first converted with ``self.astensor``.
    """
    base = self.astensor(tensor_in_1)
    exponent = self.astensor(tensor_in_2)
    return torch.pow(base, exponent)
def predict_given_factorizations(self, m, s, iK, beta):
    """
    Approximate GP regression at noisy inputs via moment matching.

    IN: mean (m) (row vector) and variance (s) of the state, plus the
        precomputed factorizations iK and beta
    OUT: mean (M) (row vector), variance (S) of the action
         and inv(s) * input-output covariance (V)

    Returns the tuple ``(M.t(), S, V.t())``.

    NOTE(review): several tensors are created with ``.cuda()``
    unconditionally, so this method needs a CUDA device as written.
    NOTE(review): ``torch.solve`` is deprecated in newer PyTorch releases in
    favour of ``torch.linalg.solve`` (which takes its arguments in the
    opposite order) - confirm the supported PyTorch version.
    """
    # Anything that is not exactly a torch.Tensor is converted; this detaches
    # it from any autograd graph, hence the warning.
    if type(m) != torch.Tensor or type(s) != torch.Tensor:
        m = torch.tensor(m).float().cuda()
        s = torch.tensor(s).float().cuda()
        print(
            "Warning: gradient may break in mgpr.predict_given_factorizations"
        )

    # Tile s so there is one copy per (output, output) pair.
    s = s.repeat(self.num_outputs, self.num_outputs, 1, 1)
    inp = self.centralized_input(m)

    # Calculate M and V: mean and inv(s) times input-output covariance
    # iL: per-output diagonal matrix of inverse lengthscales.
    iL = torch.diag_embed(
        1 / (self.model.covar_module.base_kernel.lengthscale.squeeze(1)))
    iN = inp @ iL
    B = iL @ s[0, ...] @ iL + torch.eye(self.num_dims).float().cuda()

    # Redefine iN as in^T and t --> t^T
    # B is symmetric so its the same
    t, _ = torch.solve(torch.transpose(iN, dim0=1, dim1=2), B)
    t = torch.transpose(t, dim0=1, dim1=2)

    lb = torch.exp(-torch.sum(iN * t, -1) / 2) * beta
    tiL = t @ iL
    t_det = torch.det(B)
    c = self.model.covar_module.outputscale / torch.sqrt(t_det)

    M = (torch.sum(lb, -1) * c)[:, None]
    V = (torch.transpose(tiL, dim0=1, dim1=2)
         @ lb[:, :, None])[..., 0] * c[:, None]

    # Calculate S: Predictive Covariance
    # R_0 sums the inverse squared lengthscales of every pair of outputs.
    R_0 = torch.diag_embed(1 / torch.pow(
        self.model.covar_module.base_kernel.lengthscale.squeeze(1)[
            None, :, :], 2) + 1 / torch.pow(
                self.model.covar_module.base_kernel.lengthscale.squeeze(1)
                [:, None, :], 2))
    R = s @ R_0 + torch.eye(self.num_dims).float().cuda()

    # TODO: change this block according to the PR of tensorflow. Maybe move it into a function?
    X = inp[None, :, :, :] / torch.pow(
        self.model.covar_module.base_kernel.lengthscale.squeeze(1)
        [:, None, None, :], 2)
    X2 = -inp[:, None, :, :] / torch.pow(
        self.model.covar_module.base_kernel.lengthscale.squeeze(1)[
            None, :, None, :], 2)
    q_x, _ = torch.solve(s, R)
    Q = q_x / 2
    Xs = torch.sum(X @ Q * X, -1)
    X2s = torch.sum(X2 @ Q * X2, -1)
    maha = -2 * ((X @ Q) @ torch.transpose(X2,dim0=2,dim1=3)) + \
        Xs[:, :, :, None] + X2s[:, :, None, :]
    #
    k = torch.log(self.model.covar_module.outputscale)[:, None] - \
        torch.sum(torch.pow(iN,2), -1)/2
    L = torch.exp(k[:, None, :, None] + k[None, :, None, :] + maha)
    # NOTE(review): this assignment is dead - S is overwritten by the next
    # statement before it is read.
    S = beta[:, None, None, :].repeat(1, self.num_outputs, 1, 1)
    S = (beta[:, None, None, :].repeat(1, self.num_outputs, 1, 1) @ L
         @ beta[None, :, :, None].repeat(self.num_outputs, 1, 1, 1))[:, :, 0,
                                                                     0]

    # Subtract sum(iK * diagL) on the diagonal of S, where diagL is the
    # diagonal of L taken over its two output dimensions.
    diagL = torch.diagonal(L.permute((3, 2, 1, 0)), dim1=-2,
                           dim2=-1).permute(2, 1, 0)
    S = S - torch.diag_embed(torch.sum((iK * diagL), [1, 2]))
    r_det = torch.det(R)
    S = S / torch.sqrt(r_det)
    S = S + torch.diag_embed(self.model.covar_module.outputscale)
    # Remove the outer product of the mean to turn second moments into a
    # covariance.
    S = S - M @ M.t()

    return M.t(), S, V.t()
def forward(self, dist):
    """Expand each entry of ``dist`` into ``exp(coeff * (d - offset) ** 2)``.

    ``dist`` is flattened to a column and ``self.offset`` to a row, so the
    result has one row per distance and one column per offset.
    """
    column = dist.view(-1, 1)
    centers = self.offset.view(1, -1)
    delta = column - centers
    return torch.exp(self.coeff * delta.pow(2))
def compute_length_penalty(wl1, wl2, alpha=0.25):
    """Penalise pairs whose lengths differ, based on their max/min ratio.

    For every pair the penalty is ``exp(1 - max / min) ** alpha``: it is 1
    when both lengths are equal and decays towards 0 as they diverge.

    Args:
        wl1: tensor of lengths for the first items.
        wl2: tensor of lengths for the second items, same shape as ``wl1``.
        alpha: exponent controlling how quickly the penalty decays.

    Returns:
        Float tensor with one penalty per pair.

    A batch holding a single pair is handled as well: ``squeeze()`` would
    otherwise collapse it to 0-d tensors that cannot be stacked along dim 1.
    A zero length is not guarded against and yields a division by zero
    (inf/nan ratio).
    """
    def _as_vector(lengths):
        # Drop singleton dims but keep at least one dimension.
        squeezed = lengths.squeeze()
        return squeezed.unsqueeze(0) if squeezed.dim() == 0 else squeezed

    x = torch.stack((_as_vector(wl1), _as_vector(wl2)), dim=1)
    x_min, _ = torch.min(x, dim=1)
    x_max, _ = torch.max(x, dim=1)
    ratio = x_max.float() / x_min.float()
    return torch.pow(torch.exp(1 - ratio), alpha)
def rect_to_polar(real, imag):
    """Convert rectangular (real, imag) components to polar form.

    Returns:
        Tuple ``(mag, ang)``: the magnitude ``(real**2 + imag**2) ** 0.5``
        and the angle ``atan2(imag, real)``.
    """
    squared_norm = real**2 + imag**2
    magnitude = squared_norm.pow(0.5)
    angle = torch.atan2(imag, real)
    return magnitude, angle
def forward(self, output, clip_label, motion_mask):
    """Return the mean of the mask-weighted squared error.

    The squared difference between ``output`` and ``clip_label`` is
    multiplied by ``motion_mask`` and averaged over all elements (masked-out
    elements still count in the denominator).
    """
    squared_error = (output - clip_label).pow(2)
    weighted = motion_mask * squared_error
    return weighted.mean()
def _compute_kl(self, mu, sd): mu_2 = torch.pow(mu, 2) sd_2 = torch.pow(sd, 2) encoding_loss = (mu_2 + sd_2 - torch.log(sd_2)).sum() / mu_2.size(0) return encoding_loss
""" frame_dir = "/Users/lekhang/Desktop/Khang/data/highway/input" frame_files = general_utils.get_all_files(f"{frame_dir}", keep_dir=True) frame_files = sorted(frame_files) frame_0 = cv2.imread(frame_files[0], 0) h, w = frame_0.shape U = np.array([np.array(cv2.imread(frame_files[i], 0).flatten()) / 255. for i in range(k)]).T U = np.random.rand(*U.shape) # TODO: set this make the result look very good - why? assert U.shape == (h * w, k) V = U ** 2 N = np.ones((h * w, k)) U2 = torch.from_numpy(np.random.rand(1, c * k, h, w)).float() V2 = torch.pow(U2, 2) N2 = torch.ones((1, c * k, h, w)).float() gmm_tensor = GMMBlock() for frame_file in frame_files: frame = cv2.imread(frame_file, 0) frame_rgb = cv2.imread(frame_file) frame = frame / 255. frame_rgb = frame_rgb / 255. U, V, N, prob = gmm(U, V, N, np.expand_dims(frame.flatten(), axis=-1), eta) frame_tensor = torch.from_numpy(np.expand_dims(np.moveaxis(frame_rgb[:, :, :], -1, 0), axis=0)).float() U2, V2, N2, prob2 = gmm_tensor(frame_tensor, U2, V2, N2, eta)