def forward(self, theta, matches, return_outliers=False):
    if isinstance(theta, Variable):  # handle normal batch transformations
        batch_size = theta.size()[0]
        theta = theta.clone()
        mask = self.geometricTnf(expand_dim(self.mask_id, 0, batch_size), theta)
        if return_outliers:
            mask_outliers = self.geometricTnf(expand_dim(1.0 - self.mask_id, 0, batch_size), theta)
        if self.normalize:
            epsilon = 1e-5
            mask = torch.div(mask,
                             torch.sum(torch.sum(torch.sum(mask + epsilon, 3), 2), 1)
                             .unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(mask))
            if return_outliers:
                mask_outliers = torch.div(mask_outliers,
                                          torch.sum(torch.sum(torch.sum(mask_outliers + epsilon, 3), 2), 1)
                                          .unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(mask_outliers))
        score = torch.sum(torch.sum(torch.sum(torch.mul(mask, matches), 3), 2), 1)
        if return_outliers:
            score_outliers = torch.sum(torch.sum(torch.sum(torch.mul(mask_outliers, matches), 3), 2), 1)
            return (score, score_outliers)
        return score
    elif isinstance(theta, list):
        # handle multiple transformations per batch item, batch is in list format (used for RANSAC)
        batch_size = len(theta)
        score = []
        for b in range(batch_size):
            sample_size = theta[b].size(0)
            s = self.forward(theta[b], expand_dim(matches[b, :, :, :].unsqueeze(0), 0, sample_size))
            score.append(s)
        return score
def forward(self, x, targets):
    batchSize = x.size(0)
    K = x.size(1) - 1
    Pnt = 1 / float(self.nLem)
    Pns = 1 / float(self.nLem)

    # eq 5.1 : P(origin=model) = Pmt / (Pmt + k*Pnt)
    Pmt = x.select(1, 0)
    Pmt_div = Pmt.add(K * Pnt + eps)
    lnPmt = torch.div(Pmt, Pmt_div)

    # eq 5.2 : P(origin=noise) = k*Pns / (Pms + k*Pns)
    Pon_div = x.narrow(1, 1, K).add(K * Pns + eps)
    Pon = Pon_div.clone().fill_(K * Pns)
    lnPon = torch.div(Pon, Pon_div)

    # equation 6 in ref. A
    lnPmt.log_()
    lnPon.log_()

    lnPmtsum = lnPmt.sum(0)
    lnPonsum = lnPon.view(-1, 1).sum(0)

    loss = - (lnPmtsum + lnPonsum) / batchSize
    return loss
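# A small numeric sketch of eq 5.1 above: with K noise samples drawn from a
# uniform noise distribution Pn = 1/nLem, the posterior that a score came from
# the data model is Pm / (Pm + K*Pn). The toy values for nLem, K and eps below
# are assumptions, not taken from the snippet.
import torch

nLem, K, eps = 1000, 4, 1e-7
Pn = 1.0 / nLem                      # uniform noise probability
Pm = torch.tensor([0.02, 0.001])     # model probabilities for two data samples
posterior = Pm / (Pm + K * Pn + eps)
print(posterior)                     # approximately [0.833, 0.200]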
def step(self, step, lprobs, scores):
    super()._init_buffers(lprobs)
    bsz, beam_size, vocab_size = lprobs.size()

    if step == 0:
        # at the first step all hypotheses are equally likely, so use
        # only the first beam
        lprobs = lprobs[:, ::beam_size, :].contiguous()
    else:
        # make probs contain cumulative scores for each hypothesis
        lprobs.add_(scores[:, :, step - 1].unsqueeze(-1))

    torch.topk(
        lprobs.view(bsz, -1),
        k=min(
            # Take the best 2 x beam_size predictions. We'll choose the first
            # beam_size of these which don't predict eos to continue with.
            beam_size * 2,
            lprobs.view(bsz, -1).size(1) - 1,  # -1 so we never select pad
        ),
        out=(self.scores_buf, self.indices_buf),
    )
    torch.div(self.indices_buf, vocab_size, out=self.beams_buf)
    self.indices_buf.fmod_(vocab_size)
    return self.scores_buf, self.indices_buf, self.beams_buf
def updateOutput(self, input, y): input1, input2 = input[0], input[1] # keep backward compatibility if self.buffer is None: self.buffer = input1.new() self.w1 = input1.new() self.w22 = input1.new() self.w = input1.new() self.w32 = input1.new() self._outputs = input1.new() # comparison operators behave differently from cuda/c implementations # TODO: verify name if input1.type() == 'torch.cuda.FloatTensor': self._idx = torch.cuda.ByteTensor() else: self._idx = torch.ByteTensor() torch.mul(input1, input2, out=self.buffer) torch.sum(self.buffer, 1, out=self.w1, keepdim=True) epsilon = 1e-12 torch.mul(input1, input1, out=self.buffer) torch.sum(self.buffer, 1, out=self.w22, keepdim=True).add_(epsilon) # self._outputs is also used as a temporary buffer self._outputs.resize_as_(self.w22).fill_(1) torch.div(self._outputs, self.w22, out=self.w22) self.w.resize_as_(self.w22).copy_(self.w22) torch.mul(input2, input2, out=self.buffer) torch.sum(self.buffer, 1, out=self.w32, keepdim=True).add_(epsilon) torch.div(self._outputs, self.w32, out=self.w32) self.w.mul_(self.w32) self.w.sqrt_() torch.mul(self.w1, self.w, out=self._outputs) self._outputs = self._outputs.select(1, 0) torch.eq(y, -1, out=self._idx) self._outputs[self._idx] = self._outputs[self._idx].add_(-self.margin).clamp_(min=0) torch.eq(y, 1, out=self._idx) self._outputs[self._idx] = self._outputs[self._idx].mul_(-1).add_(1) self.output = self._outputs.sum().item() if self.sizeAverage: self.output = self.output / y.size(0) return self.output
def normalize_batch(batch):
    # normalize using imagenet mean and std
    mean = batch.data.new(batch.data.size())
    std = batch.data.new(batch.data.size())
    mean[:, 0, :, :] = 0.485
    mean[:, 1, :, :] = 0.456
    mean[:, 2, :, :] = 0.406
    std[:, 0, :, :] = 0.229
    std[:, 1, :, :] = 0.224
    std[:, 2, :, :] = 0.225
    batch = torch.div(batch, 255.0)
    batch -= Variable(mean)
    # batch /= Variable(std)
    batch = torch.div(batch, Variable(std))
    return batch
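# A minimal sketch of the same ImageNet normalization written against current
# PyTorch, where broadcasting replaces the per-channel fills and the Variable
# wrapper is no longer needed. The [N, 3, H, W] shape and the 0-255 input range
# are assumptions carried over from the function above; the helper name is
# illustrative only.
import torch

def normalize_batch_broadcast(batch: torch.Tensor) -> torch.Tensor:
    mean = torch.tensor([0.485, 0.456, 0.406], device=batch.device).view(1, 3, 1, 1)
    std = torch.tensor([0.229, 0.224, 0.225], device=batch.device).view(1, 3, 1, 1)
    return (batch / 255.0 - mean) / std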
def forward(self, true_binary, rule_masks, raw_logits):
    if cmd_args.loss_type == 'binary':
        exp_pred = torch.exp(raw_logits) * rule_masks
        norm = F.torch.sum(exp_pred, 2, keepdim=True)
        prob = F.torch.div(exp_pred, norm)
        return F.binary_cross_entropy(prob, true_binary) * cmd_args.max_decode_steps

    if cmd_args.loss_type == 'perplexity':
        return my_perp_loss(true_binary, rule_masks, raw_logits)

    if cmd_args.loss_type == 'vanilla':
        exp_pred = torch.exp(raw_logits) * rule_masks + 1e-30
        norm = torch.sum(exp_pred, 2, keepdim=True)
        prob = torch.div(exp_pred, norm)

        ll = F.torch.abs(F.torch.sum(true_binary * prob, 2))
        mask = 1 - rule_masks[:, :, -1]
        logll = mask * F.torch.log(ll)

        loss = -torch.sum(logll) / true_binary.size()[1]
        return loss

    print('unknown loss type %s' % cmd_args.loss_type)
    raise NotImplementedError
def encode(self, indices, lengths, noise):
    embeddings = self.embedding(indices)
    packed_embeddings = pack_padded_sequence(input=embeddings,
                                             lengths=lengths,
                                             batch_first=True)

    # Encode
    packed_output, state = self.encoder(packed_embeddings)
    hidden, cell = state

    # batch_size x nhidden
    hidden = hidden[-1]  # get hidden state of last layer of encoder

    # normalize to unit ball (l2 norm of 1) - p=2, dim=1
    norms = torch.norm(hidden, 2, 1)
    # For older versions of PyTorch use:
    hidden = torch.div(hidden, norms.expand_as(hidden))
    # For newest version of PyTorch (as of 8/25) use this:
    # hidden = torch.div(hidden, norms.unsqueeze(1).expand_as(hidden))

    if noise and self.noise_radius > 0:
        gauss_noise = torch.normal(means=torch.zeros(hidden.size()),
                                   std=self.noise_radius)
        hidden = hidden + to_gpu(self.gpu, Variable(gauss_noise))

    return hidden
def forward(self, X, X_mask):
    # X: [m, Tx] m = batch size, Tx = word count
    m = X.size()[0]
    Tx = X.size()[1]

    X = self.embedding(X)
    # X: [m, Tx, embedding_dim]
    assert X.size() == torch.Size([m, Tx, self.embedding_dim])

    # average words in doc. use mask so we average only words, not padding
    X = torch.sum(X, 1)
    X = Variable(torch.div(X.data, X_mask))
    # X: [m, embedding_dim]
    assert X.size() == torch.Size([m, self.embedding_dim])

    X = self.linear(X)
    # X: [m, 1] or [m, num_classes]
    if self.num_classes == 2:
        assert X.size() == torch.Size([m, 1])
    else:
        assert X.size() == torch.Size([m, self.num_classes])

    if self.num_classes == 2:
        X = torch.squeeze(X)
        X = self.sigmoid(X)
        # X: [m]
        assert X.size() == torch.Size([m])
        return X
    else:
        return F.softmax(X)
def backward(ctx, grad_output):
    """
    In the backward pass we receive a Tensor containing the gradient of the loss
    with respect to the output, and we need to compute the gradient of the loss
    with respect to the input.
    """
    true_binary, rule_masks, input_logits = ctx.saved_tensors

    b = F.torch.max(input_logits, 2, keepdim=True)[0]
    raw_logits = input_logits - b
    exp_pred = torch.exp(raw_logits) * rule_masks + cmd_args.prob_fix
    norm = torch.sum(exp_pred, 2, keepdim=True)
    prob = torch.div(exp_pred, norm)

    grad_matrix1 = grad_matrix2 = None
    grad_matrix3 = prob - true_binary

    rescale = 1.0
    if not cmd_args.old_loss:
        rescale = 1.0 / true_binary.size()[1]
    grad_matrix3 = grad_matrix3 * rule_masks * grad_output.data * rescale

    return grad_matrix1, grad_matrix2, Variable(grad_matrix3)
def forward(ctx, true_binary, rule_masks, input_logits):
    ctx.save_for_backward(true_binary, rule_masks, input_logits)

    b = F.torch.max(input_logits, 2, keepdim=True)[0]
    raw_logits = input_logits - b
    exp_pred = torch.exp(raw_logits) * rule_masks + cmd_args.prob_fix
    norm = torch.sum(exp_pred, 2, keepdim=True)
    prob = torch.div(exp_pred, norm)

    ll = F.torch.abs(F.torch.sum(true_binary * prob, 2))
    mask = 1 - rule_masks[:, :, -1]
    logll = mask * F.torch.log(ll)

    if cmd_args.old_loss:
        nnz = torch.sum(mask)
        loss = -torch.sum(logll) / nnz
    else:
        loss = -torch.sum(logll) / true_binary.size()[1]

    if input_logits.is_cuda:
        return torch.Tensor([loss]).cuda()
    else:
        return torch.Tensor([loss])
def forward(self, inputs): # inputs text_raw_indices, aspect_indices, x_l, x_r = inputs[0], inputs[1], inputs[2], inputs[3] memory_len = torch.sum(text_raw_indices != 0, dim = -1) aspect_len = torch.sum(aspect_indices != 0, dim = -1) # aspect representation nonzeros_aspect = torch.tensor(aspect_len, dtype=torch.float).to(self.opt.device) aspect = self.embed(aspect_indices) aspect = torch.sum(aspect, dim=1) aspect = torch.div(aspect, nonzeros_aspect.view(nonzeros_aspect.size(0), 1)) x = aspect.unsqueeze(dim=1) # memory module memory = self.embed(text_raw_indices) # n x d memory = self.squeeze_embedding(memory, memory_len) # 默认是 batch_first # sentence representation nonzeros_memory = torch.tensor(memory_len, dtype=torch.float).to(self.opt.device) v_s = torch.sum(memory, dim = 1) v_s = torch.div(v_s, nonzeros_memory.view(nonzeros_memory.size(0),1)) v_s = v_s.unsqueeze(dim=1) # position attention module if type == 'c': memory = self.locationed_memory(memory, memory_len, left_len, aspect_len) elif type == 'cabasc': # context attention memory = self.context_attention(x_l, x_r, memory, memory_len, aspect_len) # recalculate sentence rep with new memory v_s = torch.sum(memory, dim = 1) v_s = torch.div(v_s, nonzeros_memory.view(nonzeros_memory.size(0),1)) v_s = v_s.unsqueeze(dim=1) # content attention module for _ in range(self.opt.hops): # x = self.x_linear(x) v_ts = self.attention(memory, x) # classifier v_ns = v_ts + v_s # embedd the sentence v_ns = v_ns.view(v_ns.size(0), -1) v_ms = F.tanh(self.mlp(v_ns)) out = self.dense(v_ms) out = F.softmax(out, dim=-1) return out
def step(self, closure=None): """Performs a single optimization step. Arguments: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ loss = None if closure is not None: loss = closure() for group in self.param_groups: for p in group['params']: if p.grad is None: continue grad = p.grad.data if grad.is_sparse: raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') state = self.state[p] # State initialization if len(state) == 0: state['step'] = 0 # Exponential moving average of gradient values state['exp_avg'] = torch.zeros_like(p.data) # Exponential moving average of squared gradient values state['exp_avg_sq'] = torch.zeros_like(p.data) exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] beta1, beta2 = group['betas'] state['step'] += 1 if group['weight_decay'] != 0: grad = grad.add(group['weight_decay'], p.data) if state['step'] > 1: prev_bias_correction1 = 1 - beta1 ** (state['step'] - 1) prev_bias_correction2 = 1 - beta2 ** (state['step'] - 1) # Hypergradient for Adam: h = torch.dot(grad.view(-1), torch.div(exp_avg, exp_avg_sq.sqrt().add_(group['eps'])).view(-1)) * math.sqrt(prev_bias_correction2) / prev_bias_correction1 # Hypergradient descent of the learning rate: tmp = group['hypergrad_lr'] * h group['lr'] += tmp.double().cpu() # Decay the first and second moment running average coefficient exp_avg.mul_(beta1).add_(1 - beta1, grad) exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) denom = exp_avg_sq.sqrt().add_(group['eps']) bias_correction1 = 1 - beta1 ** state['step'] bias_correction2 = 1 - beta2 ** state['step'] step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 p.data.addcdiv_(-step_size, exp_avg, denom) return loss
def updateGradInput(self, input, gradOutput): if self.gradInput is None: return if self._div is None: self._div = input.new() if self._output is None: self._output = self.output.new() if self._gradOutput is None: self._gradOutput = input.new() if self._expand3 is None: self._expand3 = input.new() if not self.fastBackward: self.updateOutput(input) inputSize, outputSize = self.weight.size(0), self.weight.size(1) """ dy_j -2 * (w_j - x) x - w_j ---- = ---------------- = ------- dx 2 || w_j - x || y_j """ # to prevent div by zero (NaN) bugs self._output.resize_as_(self.output).copy_(self.output).add_(0.0000001) self._view(self._gradOutput, gradOutput, gradOutput.size()) torch.div(gradOutput, self._output, out=self._div) assert input.dim() == 2 batchSize = input.size(0) self._div.resize_(batchSize, 1, outputSize) self._expand3 = self._div.expand(batchSize, inputSize, outputSize) if torch.typename(input) == 'torch.cuda.FloatTensor': self._repeat2.resize_as_(self._expand3).copy_(self._expand3) self._repeat2.mul_(self._repeat) else: torch.mul(self._repeat, self._expand3, out=self._repeat2) torch.sum(self._repeat2, 2, True, out=self.gradInput) self.gradInput.resize_as_(input) return self.gradInput
def sample(self, fc_feats, att_feats, opt={}): sample_max = opt.get('sample_max', 1) beam_size = opt.get('beam_size', 1) temperature = opt.get('temperature', 1.0) if beam_size > 1: return self.sample_beam(fc_feats, att_feats, opt) batch_size = fc_feats.size(0) state = self.init_hidden(batch_size) # embed fc and att feats fc_feats = self.fc_embed(fc_feats) _att_feats = self.att_embed(att_feats.view(-1, self.att_feat_size)) att_feats = _att_feats.view(*(att_feats.size()[:-1] + (self.rnn_size,))) # Project the attention feats first to reduce memory and computation comsumptions. p_att_feats = self.ctx2att(att_feats.view(-1, self.rnn_size)) p_att_feats = p_att_feats.view(*(att_feats.size()[:-1] + (self.att_hid_size,))) seq = [] seqLogprobs = [] for t in range(self.seq_length + 1): if t == 0: # input <bos> it = fc_feats.data.new(batch_size).long().zero_() elif sample_max: sampleLogprobs, it = torch.max(logprobs.data, 1) it = it.view(-1).long() else: if temperature == 1.0: prob_prev = torch.exp(logprobs.data).cpu() # fetch prev distribution: shape Nx(M+1) else: # scale logprobs by temperature prob_prev = torch.exp(torch.div(logprobs.data, temperature)).cpu() it = torch.multinomial(prob_prev, 1).cuda() sampleLogprobs = logprobs.gather(1, Variable(it, requires_grad=False)) # gather the logprobs at sampled positions it = it.view(-1).long() # and flatten indices for downstream processing xt = self.embed(Variable(it, requires_grad=False)) if t >= 1: # stop when all finished if t == 1: unfinished = it > 0 else: unfinished = unfinished * (it > 0) if unfinished.sum() == 0: break it = it * unfinished.type_as(it) seq.append(it) #seq[t] the input of t+2 time step seqLogprobs.append(sampleLogprobs.view(-1)) output, state = self.core(xt, fc_feats, att_feats, p_att_feats, state) logprobs = F.log_softmax(self.logit(output)) return torch.cat([_.unsqueeze(1) for _ in seq], 1), torch.cat([_.unsqueeze(1) for _ in seqLogprobs], 1)
def forward(ctx, input1, input2, y, margin, size_average): ctx.margin = margin ctx.size_average = size_average ctx.w1 = input1.new() ctx.w22 = input1.new() ctx.w = input1.new() ctx.w32 = input1.new() ctx._outputs = input1.new() _idx = input1.new().byte() buffer = torch.mul(input1, input2) torch.sum(buffer, 1, out=ctx.w1, keepdim=True) epsilon = 1e-12 torch.mul(input1, input1, out=buffer) torch.sum(buffer, 1, out=ctx.w22, keepdim=True).add_(epsilon) ctx._outputs.resize_as_(ctx.w22).fill_(1) torch.div(ctx._outputs, ctx.w22, out=ctx.w22) ctx.w.resize_as_(ctx.w22).copy_(ctx.w22) torch.mul(input2, input2, out=buffer) torch.sum(buffer, 1, out=ctx.w32, keepdim=True).add_(epsilon) torch.div(ctx._outputs, ctx.w32, out=ctx.w32) ctx.w.mul_(ctx.w32) ctx.w.sqrt_() torch.mul(ctx.w1, ctx.w, out=ctx._outputs) ctx._outputs = ctx._outputs.select(1, 0) torch.eq(y, -1, out=_idx) ctx._outputs[_idx] = ctx._outputs[_idx].add_(-ctx.margin).clamp_(min=0) torch.eq(y, 1, out=_idx) ctx._outputs[_idx] = ctx._outputs[_idx].mul_(-1).add_(1) output = ctx._outputs.sum() if ctx.size_average: output = output / y.size(0) ctx.save_for_backward(input1, input2, y) return input1.new((output,))
def mask_probabilities(probs, bin_, bins, bins_num):
    mask_words = bins[bin_]
    mask_words = list(set(mask_words))
    divided_probs = torch.div(probs, bins_num)
    numpy_divided_probs = divided_probs.cpu().data.numpy()
    numpy_probs = probs.cpu().data.numpy()
    numpy_probs[:, mask_words] = numpy_divided_probs[:, mask_words]
    probs.data = torch.FloatTensor(numpy_probs).cuda()
    return probs
def log_softmax(unnormalized_probs, bin_, bins, bins_num):
    # col softmax
    denom = torch.sum(unnormalized_probs.exp(), 1)
    # denom is a 200 * 1 tensor
    denom = (denom.expand(unnormalized_probs.size(1), denom.size(0))).permute(1, 0).contiguous()
    probs = torch.div(unnormalized_probs.exp(), denom)
    if bins_num >= 2:
        probs = mask_probabilities(probs, bin_, bins, bins_num)
    log_probs = torch.log(probs)
    return log_probs  # output is a n * vocab tensor
def forward(self, input1, input2, y): self.w1 = input1.new() self.w22 = input1.new() self.w = input1.new() self.w32 = input1.new() self._outputs = input1.new() _idx = input1.new().byte() buffer = torch.mul(input1, input2) torch.sum(buffer, 1, out=self.w1, keepdim=True) epsilon = 1e-12 torch.mul(input1, input1, out=buffer) torch.sum(buffer, 1, out=self.w22, keepdim=True).add_(epsilon) self._outputs.resize_as_(self.w22).fill_(1) torch.div(self._outputs, self.w22, out=self.w22) self.w.resize_as_(self.w22).copy_(self.w22) torch.mul(input2, input2, out=buffer) torch.sum(buffer, 1, out=self.w32, keepdim=True).add_(epsilon) torch.div(self._outputs, self.w32, out=self.w32) self.w.mul_(self.w32) self.w.sqrt_() torch.mul(self.w1, self.w, out=self._outputs) self._outputs = self._outputs.select(1, 0) torch.eq(y, -1, out=_idx) self._outputs[_idx] = self._outputs[_idx].add_(-self.margin).clamp_(min=0) torch.eq(y, 1, out=_idx) self._outputs[_idx] = self._outputs[_idx].mul_(-1).add_(1) output = self._outputs.sum() if self.size_average: output = output / y.size(0) self.save_for_backward(input1, input2, y) return input1.new((output,))
def mean_dist(source_points, warped_points, L_pck):
    # compute percentage of correct keypoints
    batch_size = source_points.size(0)
    dist = torch.zeros((batch_size))
    for i in range(batch_size):
        p_src = source_points[i, :]
        p_wrp = warped_points[i, :]
        N_pts = torch.sum(torch.ne(p_src[0, :], -1) * torch.ne(p_src[1, :], -1))
        point_distance = torch.pow(torch.sum(torch.pow(p_src[:, :N_pts] - p_wrp[:, :N_pts], 2), 0), 0.5)
        L_pck_mat = L_pck[i].expand_as(point_distance)
        dist[i] = torch.mean(torch.div(point_distance, L_pck_mat))
    return dist
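# A small usage sketch for mean_dist: keypoints are assumed to be [B, 2, N]
# tensors padded with -1, and L_pck a per-image normalization length of shape [B].
# The shapes and padding convention are inferred from the function above.
import torch

source_points = torch.tensor([[[10., 20., -1.], [30., 40., -1.]]])   # one image, 2 valid points + padding
warped_points = torch.tensor([[[13., 24., -1.], [34., 43., -1.]]])   # both points displaced by distance 5
L_pck = torch.tensor([100.0])                                        # e.g. max(h, w) of the image
print(mean_dist(source_points, warped_points, L_pck))                # mean of 5/100 and 5/100 -> 0.05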
def _compute_loss(self, batch, output, target, copy_attn, align): """Compute the loss. The args must match :func:`self._make_shard_state()`. Args: batch: the current batch. output: the predict output from the model. target: the validate target to compare output with. copy_attn: the copy attention value. align: the align info. """ target = target.view(-1) align = align.view(-1) scores = self.generator( self._bottle(output), self._bottle(copy_attn), batch.src_map ) loss = self.criterion(scores, align, target) # this block does not depend on the loss value computed above # and is used only for stats scores_data = collapse_copy_scores( self._unbottle(scores.clone(), batch.batch_size), batch, self.tgt_vocab, batch.dataset.src_vocabs) scores_data = self._bottle(scores_data) # this block does not depend on the loss value computed above # and is used only for stats # Correct target copy token instead of <unk> # tgt[i] = align[i] + len(tgt_vocab) # for i such that tgt[i] == 0 and align[i] != 0 target_data = target.clone() unk = self.criterion.unk_index correct_mask = (target_data == unk) & (align != unk) offset_align = align[correct_mask] + len(self.tgt_vocab) target_data[correct_mask] += offset_align # Compute sum of perplexities for stats stats = self._stats(loss.sum().clone(), scores_data, target_data) # this part looks like it belongs in CopyGeneratorLoss if self.normalize_by_length: # Compute Loss as NLL divided by seq length tgt_lens = batch.tgt[:, :, 0].ne(self.padding_idx).sum(0).float() # Compute Total Loss per sequence in batch loss = loss.view(-1, batch.batch_size).sum(0) # Divide by length of each sequence and sum loss = torch.div(loss, tgt_lens).sum() else: loss = loss.sum() return loss, stats
def compute_accuracy(self, prob_cls, gt_cls):
    # we only need the detections whose label is >= 0
    prob_cls = torch.squeeze(prob_cls)
    mask = torch.ge(gt_cls, 0)

    # get valid elements
    valid_gt_cls = torch.masked_select(gt_cls, mask)
    valid_prob_cls = torch.masked_select(prob_cls, mask)
    size = min(valid_gt_cls.size()[0], valid_prob_cls.size()[0])

    prob_ones = torch.ge(valid_prob_cls, 0.6).float()
    right_ones = torch.eq(prob_ones, valid_gt_cls.float()).float()

    return torch.div(torch.mul(torch.sum(right_ones), float(1.0)), float(size))
def forward(self, theta_aff, theta_aff_tps, matches, return_outliers=False):
    batch_size = theta_aff.size()[0]
    mask = self.compGeometricTnf(image_batch=expand_dim(self.mask_id, 0, batch_size),
                                 theta_aff=theta_aff,
                                 theta_aff_tps=theta_aff_tps)
    if return_outliers:
        mask_outliers = self.compGeometricTnf(image_batch=expand_dim(1.0 - self.mask_id, 0, batch_size),
                                              theta_aff=theta_aff,
                                              theta_aff_tps=theta_aff_tps)
    if self.normalize:
        epsilon = 1e-5
        mask = torch.div(mask,
                         torch.sum(torch.sum(torch.sum(mask + epsilon, 3), 2), 1)
                         .unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(mask))
        if return_outliers:
            # normalize the outlier mask by its own sum
            mask_outliers = torch.div(mask_outliers,
                                      torch.sum(torch.sum(torch.sum(mask_outliers + epsilon, 3), 2), 1)
                                      .unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(mask_outliers))
    score = torch.sum(torch.sum(torch.sum(torch.mul(mask, matches), 3), 2), 1)
    if return_outliers:
        score_outliers = torch.sum(torch.sum(torch.sum(torch.mul(mask_outliers, matches), 3), 2), 1)
        return (score, score_outliers)
    return score
def forward(ctx, true_binary, rule_masks, input_logits):
    ctx.save_for_backward(true_binary, rule_masks, input_logits)

    b = F.torch.max(input_logits, 2, keepdim=True)[0]
    raw_logits = input_logits - b
    exp_pred = torch.exp(raw_logits) * rule_masks
    norm = torch.sum(exp_pred, 2, keepdim=True)
    prob = torch.div(exp_pred, norm)

    loss = F.binary_cross_entropy(prob, true_binary)
    return loss
def forward(self, text, video, ind, conf=True): text_embd = {} for i, l in enumerate(self.video_GU): video[self.m[i]] = l(video[self.m[i]]) for i, l in enumerate(self.text_GU): text_embd[self.m[i]] = l(text) #MOE weights computation + normalization ------------ moe_weights = self.moe_fc(text) moe_weights = F.softmax(moe_weights, dim=1) available_m = np.zeros(moe_weights.size()) i = 0 for m in video: available_m[:,i] = ind[m] i += 1 available_m = th.from_numpy(available_m).float() available_m = Variable(available_m.cuda()) moe_weights = available_m*moe_weights norm_weights = th.sum(moe_weights, dim=1) norm_weights = norm_weights.unsqueeze(1) moe_weights = th.div(moe_weights, norm_weights) #MOE weights computation + normalization ------ DONE if conf: conf_matrix = Variable(th.zeros(len(text),len(text)).cuda()) i = 0 for m in video: video[m] = video[m].transpose(0,1) conf_matrix += moe_weights[:,i:i+1]*th.matmul(text_embd[m], video[m]) i += 1 return conf_matrix else: i = 0 scores = Variable(th.zeros(len(text)).cuda()) for m in video: text_embd[m] = moe_weights[:,i:i+1]*text_embd[m]*video[m] scores += th.sum(text_embd[m], dim=-1) i += 1 return scores
def sample_with_temperature(logits, sampling_temp, keep_topk): """Select next tokens randomly from the top k possible next tokens. Samples from a categorical distribution over the ``keep_topk`` words using the category probabilities ``logits / sampling_temp``. Args: logits (FloatTensor): Shaped ``(batch_size, vocab_size)``. These can be logits (``(-inf, inf)``) or log-probs (``(-inf, 0]``). (The distribution actually uses the log-probabilities ``logits - logits.logsumexp(-1)``, which equals the logits if they are log-probabilities summing to 1.) sampling_temp (float): Used to scale down logits. The higher the value, the more likely it is that a non-max word will be sampled. keep_topk (int): This many words could potentially be chosen. The other logits are set to have probability 0. Returns: (LongTensor, FloatTensor): * topk_ids: Shaped ``(batch_size, 1)``. These are the sampled word indices in the output vocab. * topk_scores: Shaped ``(batch_size, 1)``. These are essentially ``(logits / sampling_temp)[topk_ids]``. """ if sampling_temp == 0.0 or keep_topk == 1: # For temp=0.0, take the argmax to avoid divide-by-zero errors. # keep_topk=1 is also equivalent to argmax. topk_scores, topk_ids = logits.topk(1, dim=-1) if sampling_temp > 0: topk_scores /= sampling_temp else: logits = torch.div(logits, sampling_temp) if keep_topk > 0: top_values, top_indices = torch.topk(logits, keep_topk, dim=1) kth_best = top_values[:, -1].view([-1, 1]) kth_best = kth_best.repeat([1, logits.shape[1]]).float() # Set all logits that are not in the top-k to -10000. # This puts the probabilities close to 0. ignore = torch.lt(logits, kth_best) logits = logits.masked_fill(ignore, -10000) dist = torch.distributions.Multinomial( logits=logits, total_count=1) topk_ids = torch.argmax(dist.sample(), dim=1, keepdim=True) topk_scores = logits.gather(dim=1, index=topk_ids) return topk_ids, topk_scores
def l2_norm(input):
    """
    input: feature that needs to be normalized.
    output: normalized feature.
    """
    input_size = input.size()
    buffer = torch.pow(input, 2)
    normp = torch.sum(buffer, 1).add_(1e-10)
    norm = torch.sqrt(normp)
    _output = torch.div(input, norm.view(-1, 1).expand_as(input))
    output = _output.view(input_size)
    return output
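# A minimal check of l2_norm: each row of the output has unit Euclidean norm.
# The [batch, features] input shape is an assumption based on the dim=1 reduction above.
import torch

x = torch.tensor([[3.0, 4.0], [0.0, 2.0]])
x_hat = l2_norm(x)
print(x_hat)              # [[0.6, 0.8], [0.0, 1.0]]
print(x_hat.norm(dim=1))  # approximately [1.0, 1.0]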
def rescaleCharacter(c):
    cc = torch.cat(c, 0)
    m = cc.min(0)[0]
    s = (cc.max(0)[0] - m).float()
    for i in range(len(c)):
        c[i] = (torch.div((c[i] - m.expand_as(c[i])).float(),
                          s.expand_as(c[i])) * 255.99).byte()
    return c
def updateOutput(self, input): assert input.dim() == 2 input_size = input.size() if self._output is None: self._output = input.new() if self.norm is None: self.norm = input.new() if self.buffer is None: self.buffer = input.new() self._output.resize_as_(input) # specialization for the infinity norm if self.p == float('inf'): if not self._indices: self._indices = torch.cuda.FloatTensor() if torch.typename(self.output) == 'torch.cuda.FloatTensor' \ else torch.LongTensor() torch.abs(input, out=self.buffer) torch.max(self._indices, self.buffer, 1, out=self.norm, keepdim=True) self.norm.add_(self.eps) else: if self.normp is None: self.normp = input.new() if self.p % 2 != 0: torch.abs(input, out=self.buffer).pow_(self.p) else: torch.pow(input, self.p, out=self.buffer) torch.sum(self.buffer, 1, out=self.normp, keepdim=True).add_(self.eps) torch.pow(self.normp, 1. / self.p, out=self.norm) torch.div(input, self.norm.view(-1, 1).expand_as(input), out=self._output) self.output = self._output.view(input_size) return self.output
def forward(self, input_n, hidden, phi, nh):
    self.batch_size = input_n.size()[0]
    hidden = torch.cat((hidden, input_n), 2)

    # Aggregate representations
    h_conv = torch.div(torch.bmm(phi, hidden), nh)
    # h_conv has shape (batch_size, n, hidden_size + input_size)

    hidden = hidden.view(-1, self.hidden_size + self.input_size)
    h_conv = h_conv.view(-1, self.hidden_size + self.input_size)

    m1 = (torch.mm(hidden, self.W1)
          .view(self.batch_size, -1, self.hidden_size))
    m2 = (torch.mm(h_conv, self.W2)
          .view(self.batch_size, -1, self.hidden_size))
    m3 = self.b.unsqueeze(0).unsqueeze(1).expand_as(m2)

    hidden = torch.sigmoid(m1 + m2 + m3)
    return hidden
def forward(self, inputs):
    text_raw_indices, aspect_indices = inputs[0], inputs[1]
    text_raw_len = torch.sum(text_raw_indices != 0, dim=-1)
    aspect_len = torch.sum(aspect_indices != 0, dim=-1)

    context = self.embed(text_raw_indices)
    aspect = self.embed(aspect_indices)
    context, (_, _) = self.lstm_context(context, text_raw_len)
    aspect, (_, _) = self.lstm_aspect(aspect, aspect_len)

    aspect_len = torch.tensor(aspect_len, dtype=torch.float).to(self.opt.device)
    aspect = torch.sum(aspect, dim=1)
    aspect = torch.div(aspect, aspect_len.view(aspect_len.size(0), 1))

    text_raw_len = torch.tensor(text_raw_len, dtype=torch.float).to(self.opt.device)
    context = torch.sum(context, dim=1)
    context = torch.div(context, text_raw_len.view(text_raw_len.size(0), 1))

    aspect_final = self.attention_aspect(aspect, context).squeeze(dim=1)
    context_final = self.attention_context(context, aspect).squeeze(dim=1)

    x = torch.cat((aspect_final, context_final), dim=-1)
    out = self.dense(x)
    return out
def sum_normalize(cs, axis=TensorAxis.C):
    reduce_sum = torch.sum(cs, dim=axis, keepdim=True)
    cs_normalize = torch.div(cs, reduce_sum)
    return cs_normalize
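# A quick check of the normalization above: dividing by the per-axis sum turns
# the values into weights that sum to 1 along that axis. TensorAxis.C is not
# defined here, so the sketch uses dim=1 directly; that choice is an assumption.
import torch

cs = torch.tensor([[1.0, 3.0], [2.0, 2.0]])
print(torch.div(cs, torch.sum(cs, dim=1, keepdim=True)))  # rows sum to 1: [[0.25, 0.75], [0.5, 0.5]]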
def l2norm(X, dim, eps=1e-8):
    """L2-normalize columns of X
    """
    norm = torch.pow(X, 2).sum(dim=dim, keepdim=True).sqrt() + eps
    X = torch.div(X, norm)
    return X
def sinkhorn_stabilized(a, b, C, reg=1e-1, maxIter=1000, tau=1e3, stopThr=1e-9, verbose=False, log=False, warm_start=None, eval_freq=10, print_freq=200, **kwargs): """ Solve the entropic regularization OT problem with log stabilization The function solves the following optimization problem: .. math:: \gamma = arg\min_\gamma <\gamma,C>_F + reg\cdot\Omega(\gamma) s.t. \gamma 1 = a \gamma^T 1= b \gamma\geq 0 where : - C is the (ns,nt) metric cost matrix - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - a and b are target and source measures (sum to 1) The algorithm used for solving the problem is the Sinkhorn-Knopp matrix scaling algorithm as proposed in [1] but with the log stabilization proposed in [3] an defined in [2] (Algo 3.1) Parameters ---------- a : torch.tensor (na,) samples measure in the target domain b : torch.tensor (nb,) samples in the source domain C : torch.tensor (na,nb) loss matrix reg : float Regularization term > 0 tau : float thershold for max value in u or v for log scaling maxIter : int, optional Max number of iterations stopThr : float, optional Stop threshol on error ( > 0 ) verbose : bool, optional Print information along iterations log : bool, optional record log if True Returns ------- gamma : (na x nb) torch.tensor Optimal transportation matrix for the given parameters log : dict log dictionary return only if log==True in parameters References ---------- [1] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013 [2] Bernhard Schmitzer. Stabilized Sparse Scaling Algorithms for Entropy Regularized Transport Problems. SIAM Journal on Scientific Computing, 2019 [3] Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. (2016). Scaling algorithms for unbalanced transport problems. arXiv preprint arXiv:1607.05816. See Also -------- """ device = a.device na, nb = C.shape assert na >= 1 and nb >= 1, 'C needs to be 2d' assert na == a.shape[0] and nb == b.shape[ 0], "Shape of a or b does't match that of C" assert reg > 0, 'reg should be greater than 0' assert a.min() >= 0. 
and b.min() >= 0., 'Elements in a or b less than 0' if log: log = {'err': []} if warm_start is not None: alpha = warm_start['alpha'] beta = warm_start['beta'] else: alpha = torch.zeros(na, dtype=a.dtype).to(device) beta = torch.zeros(nb, dtype=b.dtype).to(device) u = torch.ones(na, dtype=a.dtype).to(device) / na v = torch.ones(nb, dtype=b.dtype).to(device) / nb def update_K(alpha, beta): """log space computation""" """memory efficient""" torch.add(alpha.reshape(-1, 1), beta.reshape(1, -1), out=K) torch.add(K, -C, out=K) torch.div(K, reg, out=K) torch.exp(K, out=K) def update_P(alpha, beta, u, v, ab_updated=False): """log space P (gamma) computation""" torch.add(alpha.reshape(-1, 1), beta.reshape(1, -1), out=P) torch.add(P, -C, out=P) torch.div(P, reg, out=P) if not ab_updated: torch.add(P, torch.log(u + M_EPS).reshape(-1, 1), out=P) torch.add(P, torch.log(v + M_EPS).reshape(1, -1), out=P) torch.exp(P, out=P) K = torch.empty(C.shape, dtype=C.dtype).to(device) update_K(alpha, beta) b_hat = torch.empty(b.shape, dtype=C.dtype).to(device) it = 1 err = 1 ab_updated = False # allocate memory beforehand KTu = torch.empty(v.shape, dtype=v.dtype).to(device) Kv = torch.empty(u.shape, dtype=u.dtype).to(device) P = torch.empty(C.shape, dtype=C.dtype).to(device) while (err > stopThr and it <= maxIter): upre, vpre = u, v torch.matmul(u, K, out=KTu) v = torch.div(b, KTu + M_EPS) torch.matmul(K, v, out=Kv) u = torch.div(a, Kv + M_EPS) ab_updated = False # remove numerical problems and store them in K if u.abs().sum() > tau or v.abs().sum() > tau: alpha += reg * torch.log(u + M_EPS) beta += reg * torch.log(v + M_EPS) u.fill_(1. / na) v.fill_(1. / nb) update_K(alpha, beta) ab_updated = True if log and it % eval_freq == 0: # we can speed up the process by checking for the error only all # the eval_freq iterations update_P(alpha, beta, u, v, ab_updated) b_hat = torch.sum(P, 0) err = (b - b_hat).pow(2).sum().item() log['err'].append(err) if verbose and it % print_freq == 0: print('iteration {:5d}, constraint error {:5e}'.format(it, err)) it += 1 if log: log['u'] = u log['v'] = v log['alpha'] = alpha + reg * torch.log(u + M_EPS) log['beta'] = beta + reg * torch.log(v + M_EPS) # transport plan update_P(alpha, beta, u, v, False) if log: return P, log else: return P
def similarity(vec, mat, eps=1e-6):
    vec_norm = torch.norm(vec, 2, 1)
    mat_norm = torch.norm(mat, 2, 2)
    normalized_vec = torch.div(vec, vec_norm.expand_as(vec).clamp(min=eps))
    normalized_mat = torch.div(mat, mat_norm.expand_as(mat).clamp(min=eps))
    return torch.bmm(normalized_mat, normalized_vec.unsqueeze(2)).squeeze(2)
def l2norm(x, dim=-1):
    norm = torch.pow(x, 2).sum(dim=dim, keepdim=True).sqrt()
    x_norm = torch.div(x, norm)
    return x_norm, norm
def test_precedence_semantics(self): """Test semantics for __torch_function__ for functions that take multiple arguments For functions that take multiple arguments, the appropriate __torch_function__ implementation to call is determined by examining the types of the arguments. The precedence order is left-to-right in the argument list, except subclasses are always checked before superclasses. The first result of calling the implementations in precedence order that is not NotImplemented is returned to the user. If all implementations return NotImplemented, a TypeError is raised. All cases are tested with functions implemented in C++ and either foo or baz, which are python functions defined above that are instrumented to obey the same dispatch rules as the functions in torch.functional. """ # DiagonalTensor has a valid override and SubDiagonal has an # override that returns NotImplemented so we should call the # DiagonalTensor implementation, returning -1 t1 = DiagonalTensor(5, 2) t2 = SubDiagonalTensor(5, 2) self.assertEqual(torch.div(t1, t2), -1) self.assertEqual(torch.div(t2, t1), -1) self.assertEqual(foo(t1, t2), -1) self.assertEqual(foo(t2, t1), -1) # SubTensor has an implementation that returns NotImplemented as # well so it should behave exactly like SubDiagonalTensor in the # test above t3 = SubTensor([[1, 2], [1, 2]]) self.assertEqual(torch.div(t1, t3), -1) self.assertEqual(torch.div(t3, t1), -1) self.assertEqual(foo(t1, t3), -1) self.assertEqual(foo(t3, t1), -1) # div between SubTensor and SubDiagonalTensor should raise # TypeError since both have an implementation that # explicitly returns NotImplemented with self.assertRaises(TypeError): torch.div(t2, t3) with self.assertRaises(TypeError): torch.div(t3, t2) with self.assertRaises(TypeError): foo(t2, t3) with self.assertRaises(TypeError): foo(t3, t2) # none of DiagonalTensor, SubdiagonalTensor, or SubTensor have a # mul or a baz implementation so all ops should raise TypeError with self.assertRaises(TypeError): torch.mul(t1, t1) with self.assertRaises(TypeError): torch.mul(t1, t2) with self.assertRaises(TypeError): torch.mul(t1, t3) with self.assertRaises(TypeError): torch.mul(t2, t1) with self.assertRaises(TypeError): torch.mul(t2, t2) with self.assertRaises(TypeError): torch.mul(t2, t3) with self.assertRaises(TypeError): torch.mul(t3, t1) with self.assertRaises(TypeError): torch.mul(t3, t2) with self.assertRaises(TypeError): torch.mul(t3, t3) with self.assertRaises(TypeError): baz(t1, t1) with self.assertRaises(TypeError): baz(t1, t2) with self.assertRaises(TypeError): baz(t1, t3) with self.assertRaises(TypeError): baz(t2, t1) with self.assertRaises(TypeError): baz(t2, t2) with self.assertRaises(TypeError): baz(t2, t3) with self.assertRaises(TypeError): baz(t3, t1) with self.assertRaises(TypeError): baz(t3, t2) with self.assertRaises(TypeError): baz(t3, t3)
def GAN_pretrain(model, GAN_model, criterion, optimizer, pos_feats, maxiter): model.eval() GAN_model.train() GAN_mask_batch_size = opts['GAN_mask_batch_size'] # -------------Evaluate mask------------- # print('Evaluating Mask') n = pos_feats.size(0) nBatches = int(round(float(n)/GAN_mask_batch_size)) prob_k = torch.zeros(9, 1) for k in range(0, 9): row = int(math.floor(k/3)) col = k % 3 for i in range(1, nBatches+1): # prepare batch batch = pos_feats[GAN_mask_batch_size*(i-1):min(pos_feats.size(0), GAN_mask_batch_size*i), :].data.clone() batch = batch.view(-1, 512, 3, 3) batch[:, :, col, row] = 0 batch = batch.view(batch.size(0), -1) # prepare label feat = model(batch, in_layer='fc4') if i == 1: feats = feat.data.clone() else: feats = torch.cat((feats, feat.data.clone()), 0) X = feats X_max = torch.max(feats, dim=1)[0] X_max = X_max.repeat(2, 1).permute(1, 0) E = torch.exp(feats-X_max) L = torch.sum(E, 1) Y = torch.div(E, L.repeat(2, 1).permute(1, 0)) prob_k[k] = torch.sum(Y, dim=0)[0] # print('mask {}, value: {:.3f}'.format(k, prob_k[k][0])) _, idx = torch.min(prob_k, 0) row = int(math.floor(idx/3)) col = idx % 3 # -------------GAN------------------ GAN_model.train() GAN_batch_size = opts['GAN_batch_size'] nBatches = int(round(float(n)/GAN_batch_size)) objective = torch.zeros(1, maxiter) # prepare batch data pos_idx = np.random.permutation(pos_feats.size(0)) while(len(pos_idx) < GAN_batch_size*maxiter): pos_idx = np.concatenate([pos_idx, np.random.permutation(pos_feats.size(0))]) pos_pointer = 0 # iter for iter in range(maxiter): tic = time.time() # select pos idx pos_next = pos_pointer + GAN_batch_size pos_cur_idx = pos_idx[pos_pointer:pos_next] pos_cur_idx = pos_feats.new(pos_cur_idx).long() pos_pointer = pos_next # create batch batch_pos_feats = Variable(pos_feats.index_select(0, pos_cur_idx)) labels = torch.ones(3, 3, 1, GAN_batch_size) labels[col, row, :] = 0 if opts['use_gpu']: batch_pos_feats = batch_pos_feats.cuda() labels = labels.cuda() GAN_score = GAN_model(batch_pos_feats).view(3, 3, 1, GAN_batch_size) # optimize loss = criterion(GAN_score, labels) GAN_model.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(GAN_model.parameters(), opts['grad_clip']) optimizer.step() # result objective[:, iter] = loss.item() / GAN_batch_size # objective[iter] = print "Pretrain GAN: Iter %d, Loss %.4f, Time %.3f" % (iter+1, torch.mean(objective[:,0:iter+1], dim=1).data, time.time()-tic)
def forward(self, features, labels=None, mask=None): """Compute loss for model. If both `labels` and `mask` are None, it degenerates to SimCLR unsupervised loss: https://arxiv.org/pdf/2002.05709.pdf Args: features: hidden vector of shape [bsz, n_views, ...]. labels: ground truth of shape [bsz]. mask: contrastive mask of shape [bsz, bsz], mask_{i,j}=1 if sample j has the same class as sample i. Can be asymmetric. Returns: A loss scalar. """ device = (torch.device('cuda') if features.is_cuda else torch.device('cpu')) if len(features.shape) < 3: raise ValueError('`features` needs to be [bsz, n_views, ...],' 'at least 3 dimensions are required') if len(features.shape) > 3: features = features.view(features.shape[0], features.shape[1], -1) batch_size = features.shape[0] if labels is not None and mask is not None: raise ValueError('Cannot define both `labels` and `mask`') elif labels is None and mask is None: mask = torch.eye(batch_size, dtype=torch.float32).to(device) elif labels is not None: labels = labels.contiguous().view(-1, 1) if labels.shape[0] != batch_size: raise ValueError( 'Num of labels does not match num of features') mask = torch.eq(labels, labels.T).float().to(device) else: mask = mask.float().to(device) contrast_count = features.shape[1] contrast_feature = torch.cat(torch.unbind(features, dim=1), dim=0) if self.contrast_mode == 'one': anchor_feature = features[:, 0] anchor_count = 1 elif self.contrast_mode == 'all': anchor_feature = contrast_feature anchor_count = contrast_count else: raise ValueError('Unknown mode: {}'.format(self.contrast_mode)) # compute logits anchor_dot_contrast = torch.div( torch.matmul(anchor_feature, contrast_feature.T), self.temperature) # for numerical stability logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True) logits = anchor_dot_contrast - logits_max.detach() # tile mask mask = mask.repeat(anchor_count, contrast_count) # mask-out self-contrast cases logits_mask = torch.scatter( torch.ones_like(mask), 1, torch.arange(batch_size * anchor_count).view(-1, 1).to(device), 0) mask = mask * logits_mask # compute log_prob exp_logits = torch.exp(logits) * logits_mask log_prob = logits - torch.log(exp_logits.sum(1, keepdim=True)) # compute mean of log-likelihood over positive mean_log_prob_pos = (mask * log_prob).sum(1) / mask.sum(1) # loss loss = -(self.temperature / self.base_temperature) * mean_log_prob_pos loss = loss.view(anchor_count, batch_size).mean() return loss
def linear_cg( matmul_closure, rhs, n_tridiag=0, tolerance=1e-6, eps=1e-10, max_iter=None, max_tridiag_iter=None, initial_guess=None, preconditioner=None, ): """ Implements the linear conjugate gradients method for (approximately) solving systems of the form lhs result = rhs for positive definite and symmetric matrices. Args: - matmul_closure - a function which performs a left matrix multiplication with lhs_mat - rhs - the right-hand side of the equation - n_tridiag - returns a tridiagonalization of the first n_tridiag columns of rhs - tolerance - stop the solve when the max residual is less than this - eps - noise to add to prevent division by zero - max_iter - the maximum number of CG iterations - max_tridiag_iter - the maximum size of the tridiagonalization matrix - initial_guess - an initial guess at the solution `result` - precondition_closure - a functions which left-preconditions a supplied vector Returns: result - a solution to the system (if n_tridiag is 0) result, tridiags - a solution to the system, and corresponding tridiagonal matrices (if n_tridiag > 0) """ # Unsqueeze, if necesasry is_vector = rhs.ndimension() == 1 if is_vector: rhs = rhs.unsqueeze(-1) # Some default arguments if max_iter is None: max_iter = settings.max_cg_iterations.value() if max_tridiag_iter is None: max_tridiag_iter = settings.max_lanczos_quadrature_iterations.value() if initial_guess is None: initial_guess = torch.zeros_like(rhs) if preconditioner is None: preconditioner = _default_preconditioner precond = False else: precond = True # If we are running m CG iterations, we obviously can't get more than m Lanczos coefficients if max_tridiag_iter > max_iter: raise RuntimeError( "Getting a tridiagonalization larger than the number of CG iterations run is not possible!" ) # Check matmul_closure object if torch.is_tensor(matmul_closure): matmul_closure = matmul_closure.matmul elif not callable(matmul_closure): raise RuntimeError( "matmul_closure must be a tensor, or a callable object!") # Get some constants batch_shape = rhs.shape[:-2] num_rows = rhs.size(-2) n_iter = min(max_iter, num_rows) if settings.terminate_cg_by_size.on() else max_iter n_tridiag_iter = min(max_tridiag_iter, num_rows) # result <- x_{0} result = initial_guess # residual: residual_{0} = b_vec - lhs x_{0} residual = rhs - matmul_closure(result) # Check for NaNs if not torch.equal(residual, residual): raise RuntimeError( "NaNs encounterd when trying to perform matrix-vector multiplication" ) # Sometime we're lucky and the preconditioner solves the system right away residual_norm = residual.norm(2, dim=-2) if (residual_norm < tolerance).all() and not n_tridiag: n_iter = 0 # Skip the iteration! 
# Otherwise, let's define precond_residual and curr_conjugate_vec else: # precon_residual{0} = M^-1 residual_{0} precond_residual = preconditioner(residual) curr_conjugate_vec = precond_residual residual_inner_prod = precond_residual.mul(residual).sum(-2, keepdim=True) # Define storage matrices mul_storage = torch.empty_like(residual) alpha = torch.empty(*batch_shape, rhs.size(-1), dtype=residual.dtype, device=residual.device) beta = torch.empty_like(alpha) # Define tridiagonal matrices, if applicable if n_tridiag: t_mat = torch.zeros(n_tridiag_iter, n_tridiag_iter, *batch_shape, n_tridiag, dtype=alpha.dtype, device=alpha.device) alpha_reciprocal = torch.empty(*batch_shape, n_tridiag, dtype=t_mat.dtype, device=t_mat.device) prev_alpha_reciprocal = torch.empty_like(alpha_reciprocal) prev_beta = torch.empty_like(alpha_reciprocal) update_tridiag = True last_tridiag_iter = 0 # Start the iteration for k in range(n_iter): # Get next alpha # alpha_{k} = (residual_{k-1}^T precon_residual{k-1}) / (p_vec_{k-1}^T mat p_vec_{k-1}) mvms = matmul_closure(curr_conjugate_vec) if precond: torch.mul(curr_conjugate_vec, mvms, out=mul_storage) torch.sum(mul_storage, -2, keepdim=True, out=alpha) alpha.add_(eps) torch.div(residual_inner_prod, alpha, out=alpha) # Update residual # residual_{k} = residual_{k-1} - alpha_{k} mat p_vec_{k-1} torch.addcmul(residual, -1, alpha, mvms, out=residual) # Update precond_residual # precon_residual{k} = M^-1 residual_{k} precond_residual = preconditioner(residual) _jit_linear_cg_updates( result, alpha, residual_inner_prod, torch.tensor(eps), beta, residual, precond_residual, mul_storage, curr_conjugate_vec, ) else: _jit_linear_cg_updates_no_precond( mvms, result, alpha, residual_inner_prod, torch.tensor(eps), beta, residual, precond_residual, mul_storage, curr_conjugate_vec, ) # Update tridiagonal matrices, if applicable if n_tridiag and k < n_tridiag_iter and update_tridiag: alpha_tridiag = alpha.squeeze_(-2).narrow(-1, 0, n_tridiag) beta_tridiag = beta.squeeze_(-2).narrow(-1, 0, n_tridiag) torch.reciprocal(alpha_tridiag, out=alpha_reciprocal) if k == 0: t_mat[k, k].copy_(alpha_reciprocal) else: torch.addcmul(alpha_reciprocal, prev_beta, prev_alpha_reciprocal, out=t_mat[k, k]) torch.mul(prev_beta.sqrt_(), prev_alpha_reciprocal, out=t_mat[k, k - 1]) t_mat[k - 1, k].copy_(t_mat[k, k - 1]) if t_mat[k - 1, k].max() < 1e-6: update_tridiag = False last_tridiag_iter = k prev_alpha_reciprocal.copy_(alpha_reciprocal) prev_beta.copy_(beta_tridiag) if is_vector: result = result.squeeze(-1) if n_tridiag: t_mat = t_mat[:last_tridiag_iter + 1, :last_tridiag_iter + 1] return result, t_mat.permute(-1, *range(2, 2 + len(batch_shape)), 0, 1).contiguous() else: return result
def _div_aten(a, b):
    if isinstance(a, (bool, int)):
        return torch.div(a, b, rounding_mode="trunc")
    return torch.true_divide(a, b)
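# The two branches above behave differently; a minimal illustration of
# truncating versus true division with torch.div / torch.true_divide.
import torch

print(torch.div(torch.tensor(7), torch.tensor(2), rounding_mode="trunc"))   # tensor(3)
print(torch.true_divide(torch.tensor(7), torch.tensor(2)))                  # tensor(3.5000)
print(torch.div(torch.tensor(-7), torch.tensor(2), rounding_mode="trunc"))  # tensor(-3), truncates toward zero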
def _generate_single_step(self, src_tokens, src_lengths, beam_size=None, maxlen=None, prefix_tokens=None): bsz, srclen = src_tokens.size() maxlen = min(maxlen, self.maxlen) if maxlen is not None else self.maxlen # the max beam size is the dictionary size - 1, since we never select pad beam_size = beam_size if beam_size is not None else self.beam_size beam_size = min(beam_size, self.vocab_size - 1) encoder_outs = [] incremental_states = {} for model in self.models: if not self.retain_dropout: model.eval() if isinstance(model.decoder, FairseqIncrementalDecoder): incremental_states[model] = {} else: incremental_states[model] = None # compute the encoder output for each beam encoder_out = model.encoder( src_tokens.repeat(1, beam_size).view(-1, srclen), src_lengths.expand( beam_size, src_lengths.numel()).t().contiguous().view(-1), ) encoder_outs.append(encoder_out) # initialize buffers scores = src_tokens.data.new(bsz * beam_size, maxlen + 1).float().fill_(0) scores_buf = scores.clone() tokens = src_tokens.data.new(bsz * beam_size, maxlen + 2).fill_(self.pad) tokens_buf = tokens.clone() tokens[:, 0] = self.eos attn = scores.new(bsz * beam_size, src_tokens.size(1), maxlen + 2) attn_buf = attn.clone() # list of completed sentences finalized = [[] for i in range(bsz)] finished = [False for i in range(bsz)] worst_finalized = [{ 'idx': None, 'score': -math.inf } for i in range(bsz)] num_remaining_sent = bsz # number of candidate hypos per step cand_size = 2 * beam_size # 2 x beam size in case half are EOS # offset arrays for converting between different indexing schemes bbsz_offsets = (torch.arange(0, bsz) * beam_size).unsqueeze(1).type_as(tokens) cand_offsets = torch.arange(0, cand_size).type_as(tokens) # helper function for allocating buffers on the fly buffers = {} def buffer(name, type_of=tokens): # noqa if name not in buffers: buffers[name] = type_of.new() return buffers[name] def is_finished(sent, step, unfinalized_scores=None): """ Check whether we've finished generation for a given sentence, by comparing the worst score among finalized hypotheses to the best possible score among unfinalized hypotheses. """ assert len(finalized[sent]) <= beam_size if len(finalized[sent]) == beam_size: if self.stop_early or step == maxlen or unfinalized_scores is None: return True # stop if the best unfinalized score is worse than the worst # finalized one best_unfinalized_score = unfinalized_scores[sent].max() if self.normalize_scores: best_unfinalized_score /= maxlen**self.len_penalty if worst_finalized[sent]['score'] >= best_unfinalized_score: return True return False def finalize_hypos(step, bbsz_idx, eos_scores, unfinalized_scores=None): """ Finalize the given hypotheses at this step, while keeping the total number of finalized hypotheses per sentence <= beam_size. Note: the input must be in the desired finalization order, so that hypotheses that appear earlier in the input are preferred to those that appear later. 
Args: step: current time step bbsz_idx: A vector of indices in the range [0, bsz*beam_size), indicating which hypotheses to finalize eos_scores: A vector of the same size as bbsz_idx containing scores for each hypothesis unfinalized_scores: A vector containing scores for all unfinalized hypotheses """ assert bbsz_idx.numel() == eos_scores.numel() # clone relevant token and attention tensors tokens_clone = tokens.index_select(0, bbsz_idx) tokens_clone = tokens_clone[:, 1:step + 2] # skip the first index, which is EOS tokens_clone[:, step] = self.eos attn_clone = attn.index_select(0, bbsz_idx)[:, :, 1:step + 2] # compute scores per token position pos_scores = scores.index_select(0, bbsz_idx)[:, :step + 1] pos_scores[:, step] = eos_scores # convert from cumulative to per-position scores pos_scores[:, 1:] = pos_scores[:, 1:] - pos_scores[:, :-1] # normalize sentence-level scores if self.normalize_scores: eos_scores /= (step + 1)**self.len_penalty cum_unfin = [] prev = 0 for f in finished: if f: prev += 1 else: cum_unfin.append(prev) sents_seen = set() for i, (idx, score) in enumerate( zip(bbsz_idx.tolist(), eos_scores.tolist())): unfin_idx = idx // beam_size sent = unfin_idx + cum_unfin[unfin_idx] sents_seen.add((sent, unfin_idx)) def get_hypo(): # remove padding tokens from attn scores nonpad_idxs = src_tokens[sent].ne(self.pad) hypo_attn = attn_clone[i][nonpad_idxs] _, alignment = hypo_attn.max(dim=0) return { 'tokens': tokens_clone[i], 'score': score, 'attention': hypo_attn, # src_len x tgt_len 'alignment': alignment, 'positional_scores': pos_scores[i], } if len(finalized[sent]) < beam_size: finalized[sent].append(get_hypo()) elif not self.stop_early and score > worst_finalized[sent][ 'score']: # replace worst hypo for this sentence with new/better one worst_idx = worst_finalized[sent]['idx'] if worst_idx is not None: finalized[sent][worst_idx] = get_hypo() # find new worst finalized hypo for this sentence idx, s = min(enumerate(finalized[sent]), key=lambda r: r[1]['score']) worst_finalized[sent] = { 'score': s['score'], 'idx': idx, } newly_finished = [] for sent, unfin_idx in sents_seen: # check termination conditions for this sentence if not finished[sent] and is_finished(sent, step, unfinalized_scores): finished[sent] = True newly_finished.append(unfin_idx) return newly_finished reorder_state = None batch_idxs = None # print("SHAPE", prefix_tokens.size()[1]+1) if prefix_tokens is not None: num_of_steps = prefix_tokens.size()[1] + 1 print("PREFIX TOKENS NOT NONE") else: print("PREFIX TOKENS NONE") num_of_steps = 1 print("NUM OF STEPS", num_of_steps) for step in range(num_of_steps): # one extra step for EOS marker # reorder decoder internal states based on the prev choice of beams if reorder_state is not None: if batch_idxs is not None: # update beam indices to take into account removed sentences corr = batch_idxs - torch.arange( batch_idxs.numel()).type_as(batch_idxs) reorder_state.view(-1, beam_size).add_( corr.unsqueeze(-1) * beam_size) for i, model in enumerate(self.models): if isinstance(model.decoder, FairseqIncrementalDecoder): model.decoder.reorder_incremental_state( incremental_states[model], reorder_state) encoder_outs[i] = model.encoder.reorder_encoder_out( encoder_outs[i], reorder_state) probs, avg_attn_scores = self._decode(tokens[:, :step + 1], encoder_outs, incremental_states) # print(probs,probs.size(),"PROBS, SequenceGenerator") # print(avg_attn_scores, avg_attn_scores.size(), "avg_attn_scores SequenceGenerator") # print(probs.numpy()) c = np.exp(probs.numpy()[:10]) a = 
(np.argsort(-probs.numpy()[0])) b = [self.tgt_dict.symbols[x] for x in a[:10]] # d = (np.argmax(-probs.numpy()[0])) # print(b,step) if step == num_of_steps - 1: return probs.numpy()[0] # print(d) # raise Exception if step == 0: # at the first step all hypotheses are equally likely, so use # only the first beam probs = probs.unfold(0, 1, beam_size).squeeze(2).contiguous() scores = scores.type_as(probs) scores_buf = scores_buf.type_as(probs) elif not self.sampling: # make probs contain cumulative scores for each hypothesis probs.add_(scores[:, step - 1].view(-1, 1)) probs[:, self.pad] = -math.inf # never select pad probs[:, self.unk] -= self.unk_penalty # apply unk penalty # Record attention scores attn[:, :, step + 1].copy_(avg_attn_scores) cand_scores = buffer('cand_scores', type_of=scores) cand_indices = buffer('cand_indices') cand_beams = buffer('cand_beams') eos_bbsz_idx = buffer('eos_bbsz_idx') eos_scores = buffer('eos_scores', type_of=scores) if step < maxlen: if prefix_tokens is not None and step < prefix_tokens.size(1): probs_slice = probs.view(bsz, -1, probs.size(-1))[:, 0, :] cand_scores = torch.gather( probs_slice, dim=1, index=prefix_tokens[:, step].view(-1, 1).data).expand( -1, cand_size) cand_indices = prefix_tokens[:, step].view(-1, 1).expand( bsz, cand_size).data cand_beams.resize_as_(cand_indices).fill_(0) elif self.sampling: assert self.pad == 1, 'sampling assumes the first two symbols can be ignored' if self.sampling_topk > 0: values, indices = probs[:, 2:].topk(self.sampling_topk) exp_probs = values.div_( self.sampling_temperature).exp() if step == 0: torch.multinomial(exp_probs, beam_size, replacement=True, out=cand_indices) else: torch.multinomial(exp_probs, 1, replacement=True, out=cand_indices) torch.gather(exp_probs, dim=1, index=cand_indices, out=cand_scores) torch.gather(indices, dim=1, index=cand_indices, out=cand_indices) cand_indices.add_(2) else: exp_probs = probs.div_( self.sampling_temperature).exp_().view( -1, self.vocab_size) if step == 0: # we exclude the first two vocab items, one of which is pad torch.multinomial(exp_probs[:, 2:], beam_size, replacement=True, out=cand_indices) else: torch.multinomial(exp_probs[:, 2:], 1, replacement=True, out=cand_indices) cand_indices.add_(2) torch.gather(exp_probs, dim=1, index=cand_indices, out=cand_scores) cand_scores.log_() cand_indices = cand_indices.view(bsz, -1).repeat(1, 2) cand_scores = cand_scores.view(bsz, -1).repeat(1, 2) if step == 0: cand_beams = torch.zeros( bsz, cand_size).type_as(cand_indices) else: cand_beams = torch.arange(0, beam_size).repeat( bsz, 2).type_as(cand_indices) # make scores cumulative cand_scores.add_( torch.gather( scores[:, step - 1].view(bsz, beam_size), dim=1, index=cand_beams, )) else: # take the best 2 x beam_size predictions. We'll choose the first # beam_size of these which don't predict eos to continue with. 
torch.topk( probs.view(bsz, -1), k=min(cand_size, probs.view(bsz, -1).size(1) - 1), # -1 so we never select pad out=(cand_scores, cand_indices), ) torch.div(cand_indices, self.vocab_size, out=cand_beams) cand_indices.fmod_(self.vocab_size) else: # finalize all active hypotheses once we hit maxlen # pick the hypothesis with the highest prob of EOS right now torch.sort( probs[:, self.eos], descending=True, out=(eos_scores, eos_bbsz_idx), ) num_remaining_sent -= len( finalize_hypos(step, eos_bbsz_idx, eos_scores)) assert num_remaining_sent == 0 break # cand_bbsz_idx contains beam indices for the top candidate # hypotheses, with a range of values: [0, bsz*beam_size), # and dimensions: [bsz, cand_size] cand_bbsz_idx = cand_beams.add(bbsz_offsets) # finalize hypotheses that end in eos eos_mask = cand_indices.eq(self.eos) finalized_sents = set() if step >= self.minlen: # only consider eos when it's among the top beam_size indices torch.masked_select( cand_bbsz_idx[:, :beam_size], mask=eos_mask[:, :beam_size], out=eos_bbsz_idx, ) if eos_bbsz_idx.numel() > 0: torch.masked_select( cand_scores[:, :beam_size], mask=eos_mask[:, :beam_size], out=eos_scores, ) finalized_sents = finalize_hypos(step, eos_bbsz_idx, eos_scores, cand_scores) num_remaining_sent -= len(finalized_sents) assert num_remaining_sent >= 0 if num_remaining_sent == 0: break assert step < maxlen if len(finalized_sents) > 0: new_bsz = bsz - len(finalized_sents) # construct batch_idxs which holds indices of batches to keep for the next pass batch_mask = torch.ones(bsz).type_as(cand_indices) batch_mask[cand_indices.new(finalized_sents)] = 0 batch_idxs = batch_mask.nonzero().squeeze(-1) eos_mask = eos_mask[batch_idxs] cand_beams = cand_beams[batch_idxs] bbsz_offsets.resize_(new_bsz, 1) cand_bbsz_idx = cand_beams.add(bbsz_offsets) cand_scores = cand_scores[batch_idxs] cand_indices = cand_indices[batch_idxs] if prefix_tokens is not None: prefix_tokens = prefix_tokens[batch_idxs] scores = scores.view(bsz, -1)[batch_idxs].view( new_bsz * beam_size, -1) scores_buf.resize_as_(scores) tokens = tokens.view(bsz, -1)[batch_idxs].view( new_bsz * beam_size, -1) tokens_buf.resize_as_(tokens) attn = attn.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, attn.size(1), -1) attn_buf.resize_as_(attn) bsz = new_bsz else: batch_idxs = None # set active_mask so that values > cand_size indicate eos hypos # and values < cand_size indicate candidate active hypos. 
# After, the min values per row are the top candidate active hypos active_mask = buffer('active_mask') torch.add( eos_mask.type_as(cand_offsets) * cand_size, cand_offsets[:eos_mask.size(1)], out=active_mask, ) # get the top beam_size active hypotheses, which are just the hypos # with the smallest values in active_mask active_hypos, _ignore = buffer('active_hypos'), buffer('_ignore') torch.topk(active_mask, k=beam_size, dim=1, largest=False, out=(_ignore, active_hypos)) active_bbsz_idx = buffer('active_bbsz_idx') torch.gather( cand_bbsz_idx, dim=1, index=active_hypos, out=active_bbsz_idx, ) active_scores = torch.gather( cand_scores, dim=1, index=active_hypos, out=scores[:, step].view(bsz, beam_size), ) active_bbsz_idx = active_bbsz_idx.view(-1) active_scores = active_scores.view(-1) # copy tokens and scores for active hypotheses torch.index_select( tokens[:, :step + 1], dim=0, index=active_bbsz_idx, out=tokens_buf[:, :step + 1], ) torch.gather( cand_indices, dim=1, index=active_hypos, out=tokens_buf.view(bsz, beam_size, -1)[:, :, step + 1], ) if step > 0: torch.index_select( scores[:, :step], dim=0, index=active_bbsz_idx, out=scores_buf[:, :step], ) torch.gather( cand_scores, dim=1, index=active_hypos, out=scores_buf.view(bsz, beam_size, -1)[:, :, step], ) # copy attention for active hypotheses torch.index_select( attn[:, :, :step + 2], dim=0, index=active_bbsz_idx, out=attn_buf[:, :, :step + 2], ) # swap buffers tokens, tokens_buf = tokens_buf, tokens scores, scores_buf = scores_buf, scores attn, attn_buf = attn_buf, attn # reorder incremental state in decoder reorder_state = active_bbsz_idx # print("RESULT") # print(scores[0]) # print([self.tgt_dict.symbols[x] for x in tokens[0]][:5]) # print([self.tgt_dict.symbols[x] for x in tokens[0]][:5]) # print([self.tgt_dict.symbols[x] for x in tokens[1]][:5]) # print([self.tgt_dict.symbols[x] for x in tokens[2]][:5]) # print([x for x in tokens[0]][:5]) # print([self.tgt_dict.symbols[x] for x in tokens[0]][:5]) # print([x for x in tokens[1]][:5]) # print([x for x in tokens[4]][:5]) # print(self.tgt_dict.symbols[1115],self.tgt_dict.symbols[5741]) # ,scores,self.tgt_dict.string(tokens[0]),"try",self.tgt_dict.string([1,62,4])) # sort by score descending for sent in range(len(finalized)): finalized[sent] = sorted(finalized[sent], key=lambda r: r['score'], reverse=True) return finalized
def matrix_poly(matrix, d):
    x = torch.eye(d).to(device) + torch.div(matrix.to(device), d).to(device)
    return torch.matrix_power(x, d)
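This computes the polynomial approximation (I + A/d)^d of the matrix exponential, the form commonly used in differentiable acyclicity constraints. A minimal usage sketch, assuming the definition above is in scope and that `device` is a module-level global; the adjacency matrix `A` below is illustrative:

import torch

device = torch.device('cpu')   # assumption: matrix_poly resolves `device` as a module-level global

A = torch.rand(3, 3) * 0.1     # toy weighted adjacency matrix
expm_A = matrix_poly(A, 3)     # (I + A/3)^3, a truncated approximation of exp(A)
h = torch.trace(expm_A) - 3.0  # trace(exp(A)) - d, one common acyclicity penalty form
print(expm_A, h)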
def _record_eta_batchwise(model, X, y, args): epsilon = args.epsilon_attack num_steps = args.num_steps_attack step_size = epsilon * 8 / num_steps smth_avg_steps = args.smth_avg_steps num_avg_steps = args.grad_avg_steps device = args.device print("epsilon is:{}".format(epsilon)) print("num_steps is:{}".format(num_steps)) X_pgd = Variable(X.data, requires_grad=True) if args.random: random_noise = torch.FloatTensor(*X_pgd.shape).normal_( mean=0, std=2 * epsilon).to( device) #.uniform_(-epsilon, epsilon).to(device) random_noise_reshaped = random_noise.view(random_noise.size(0), -1) random_noise_reshaped_norm = torch.norm(random_noise_reshaped, p=2, dim=1, keepdim=True) all_epsilon_vec = (epsilon * torch.ones([ random_noise_reshaped_norm.size(0), random_noise_reshaped_norm.size(1) ])).type_as(random_noise_reshaped_norm) random_noise_reshaped_normzed = epsilon * torch.div( random_noise_reshaped, torch.max(random_noise_reshaped_norm, all_epsilon_vec).expand( -1, random_noise_reshaped.size(1)) + 1e-8) random_noise_final = random_noise_reshaped_normzed.view( X_pgd.size(0), X_pgd.size(1), X_pgd.size(2), X_pgd.size(3)) X_pgd = Variable(X_pgd.data + random_noise_final, requires_grad=True) for _ in range(num_steps): opt = optim.SGD([X_pgd], lr=1e-3) opt.zero_grad() ### Here you add averaging... for avg_ in range(num_avg_steps): noi_z = model(X_pgd, args) soft_z = F.softmax(noi_z, dim=1) if avg_ == 0: soft_z_avg = soft_z else: soft_z_avg = soft_z_avg + soft_z soft_z_avg = soft_z_avg / float(num_avg_steps) logsoftmax = torch.log(soft_z_avg.clamp(min=1e-20)) loss = F.nll_loss(logsoftmax, y) loss.backward() X_pgd_grad = X_pgd.grad.data X_pgd_grad_reshaped = X_pgd_grad.view(X_pgd_grad.size(0), -1) X_pgd_grad_reshaped_norm = torch.norm(X_pgd_grad_reshaped, p=2, dim=1, keepdim=True) X_pgd_grad_reshaped_normzed = torch.div( X_pgd_grad_reshaped, X_pgd_grad_reshaped_norm.expand(-1, X_pgd_grad_reshaped.size(1)) + 1e-8) X_pgd_grad_normzed = X_pgd_grad_reshaped_normzed.view( X_pgd_grad.size(0), X_pgd_grad.size(1), X_pgd_grad.size(2), X_pgd_grad.size(3)) eta = step_size * X_pgd_grad_normzed.data X_pgd = Variable(X_pgd.data + eta, requires_grad=True) eta_tot = X_pgd.data - X.data eta_tot_reshaped = eta_tot.view(eta_tot.size(0), -1) eta_tot_reshaped_norm = torch.norm(eta_tot_reshaped, p=2, dim=1, keepdim=True) all_epsilon_vec = (epsilon * torch.ones( [eta_tot_reshaped_norm.size(0), eta_tot_reshaped_norm.size(1)])).type_as(eta_tot_reshaped_norm) eta_tot_reshaped_normzed = epsilon * torch.div( eta_tot_reshaped, torch.max(eta_tot_reshaped_norm, all_epsilon_vec).expand( -1, eta_tot_reshaped.size(1)) + 1e-8) eta_tot_final = eta_tot_reshaped_normzed.view(X_pgd_grad.size(0), X_pgd_grad.size(1), X_pgd_grad.size(2), X_pgd_grad.size(3)) X_pgd = Variable(torch.clamp(X.data + eta_tot_final.data, 0, 1.0), requires_grad=True) #X_pgd = Variable(torch.clamp(X_pgd, 0, 1.0), requires_grad=True) with torch.no_grad(): for step in range(smth_avg_steps): out = model(X.data, args) out_pgd = model(X_pgd.data, args) if step != 0: cum_counts = cum_counts + (torch.max( out.data, dim=1, keepdim=True)[0].repeat( 1, out.data.size(1)) == out.data).float() cum_counts_pgd = cum_counts_pgd + (torch.max( out_pgd.data, dim=1, keepdim=True)[0].repeat( 1, out_pgd.data.size(1)) == out_pgd.data).float() else: cum_counts = (torch.max( out.data, dim=1, keepdim=True)[0].repeat( 1, out.data.size(1)) == out.data).float() cum_counts_pgd = (torch.max( out_pgd.data, dim=1, keepdim=True)[0].repeat( 1, out_pgd.data.size(1)) == out_pgd.data).float() err = 
(cum_counts.data.max(1)[1] != y.data).float().sum() err_pgd = (cum_counts_pgd.data.max(1)[1] != y.data).float().sum() eta_final = X_pgd.data - X.data print('err nat: ', err) print('err pgd (white-box): ', err_pgd) return X_pgd.data, err_pgd, eta_final
def step(self, closure=None): loss = None if closure is not None: loss = closure() for group in self.param_groups: for p in group['params']: if p.grad is None: continue grad = p.grad.data if grad.is_sparse: raise RuntimeError( 'Adam does not support sparse gradients, please consider SparseAdam instead' ) flag1, flag2 = self._check_shape(grad.size()) new_shape = p.data.size() if flag2 and group['enable_factorization']: new_shape, old_shape =\ self._experimental_reshape(p.data.size()) grad = grad.view(new_shape) state = self.state[p] if len(state) == 0: state['step'] = 0 if group['enable_momentum']: state['exp_avg'] = torch.zeros(new_shape, dtype=torch.float32, device=p.grad.device) if flag1 and group['enable_factorization']: state['exp_avg_sq_R'] = torch.zeros( (1, new_shape[1]), dtype=torch.float32, device=p.grad.device) state['exp_avg_sq_C'] = torch.zeros( (new_shape[0], 1), dtype=torch.float32, device=p.grad.device) else: state['exp_avg_sq'] = torch.zeros(new_shape, dtype=torch.float32, device=p.grad.device) if group['ams_grad']: state['exp_avg_sq_hat'] = torch.zeros( new_shape, dtype=torch.float32, device=p.grad.device) if group['enable_momentum']: exp_avg = state['exp_avg'] if flag1 and group['enable_factorization']: exp_avg_sq_R = state['exp_avg_sq_R'] exp_avg_sq_C = state['exp_avg_sq_C'] else: exp_avg_sq = state['exp_avg_sq'] if group['ams_grad']: exp_avg_sq_hat = state['exp_avg_sq_hat'] state['step'] += 1 lr_t = group['lr'](state['step']) if group['relative_step_size']: lr_t *= max(group['eps2'], self._rms(data)) if group['enable_momentum']: beta1_t = group['beta1'](state['step']) exp_avg.mul_(beta1_t).add_(1 - beta1_t, grad) beta2_t = group['beta2'](state['step']) if flag1 and group['enable_factorization']: exp_avg_sq_R.mul_(beta2_t).add_( 1 - beta2_t, torch.sum(torch.mul(grad, grad).add_(group['eps1']), dim=0, keepdim=True)) exp_avg_sq_C.mul_(beta2_t).add_( 1 - beta2_t, torch.sum(torch.mul(grad, grad).add_(group['eps1']), dim=1, keepdim=True)) v = torch.mul(exp_avg_sq_C, exp_avg_sq_R).div_(torch.sum(exp_avg_sq_R)) else: exp_avg_sq.mul_(beta2_t).addcmul_( 1 - beta2_t, grad, grad).add_( (1 - beta2_t) * group['eps1']) v = exp_avg_sq g = grad if group['enable_momentum']: g = torch.div(exp_avg, 1 - beta1_t**state['step']) if group['ams_grad']: torch.max(exp_avg_sq_hat, v, out=exp_avg_sq_hat) v = exp_avg_sq_hat u = torch.div(g, (torch.div( v, 1 - beta2_t**state['step'])).sqrt().add_( group['eps1'])) else: u = torch.div(g, v.sqrt()) u.div_(max(1, self._rms(u) / group['cliping_threshold'])) p.data.add_(-lr_t * (u.view(old_shape) if flag2 and group['enable_factorization'] else u)) if group['weight_decay'] != 0: p.data.add_(-group['weight_decay'] * lr_t, p.data) return loss
def train_update(model, GAN_model, criterion, GAN_criterion, optimizer, GAN_optimizer, pos_feats, neg_feats, maxiter, in_layer='fc4'): batch_pos = opts['batch_pos'] batch_neg = opts['batch_neg'] batch_test = opts['batch_test'] batch_neg_cand = max(opts['batch_neg_cand'], batch_neg) pos_idx = np.random.permutation(pos_feats.size(0)) neg_idx = np.random.permutation(neg_feats.size(0)) while(len(pos_idx) < batch_pos*maxiter): pos_idx = np.concatenate([pos_idx, np.random.permutation(pos_feats.size(0))]) while(len(neg_idx) < batch_neg_cand*maxiter): neg_idx = np.concatenate([neg_idx, np.random.permutation(neg_feats.size(0))]) pos_pointer = 0 neg_pointer = 0 for iter in range(maxiter): # select pos idx pos_next = pos_pointer+batch_pos pos_cur_idx = pos_idx[pos_pointer:pos_next] pos_cur_idx = pos_feats.new(pos_cur_idx).long() pos_pointer = pos_next # select neg idx neg_next = neg_pointer+batch_neg_cand neg_cur_idx = neg_idx[neg_pointer:neg_next] neg_cur_idx = neg_feats.new(neg_cur_idx).long() neg_pointer = neg_next # create batch batch_pos_feats = Variable(pos_feats.index_select(0, pos_cur_idx)) batch_neg_feats = Variable(neg_feats.index_select(0, neg_cur_idx)) # hard negative mining if batch_neg_cand > batch_neg: model.eval() for start in range(0, batch_neg_cand, batch_test): end = min(start+batch_test, batch_neg_cand) score = model(batch_neg_feats[start:end], in_layer=in_layer) if start==0: neg_cand_score = score.data[:, 1].clone() else: neg_cand_score = torch.cat((neg_cand_score, score.data[:, 1].clone()), 0) _, top_idx = neg_cand_score.topk(batch_neg) batch_neg_feats = batch_neg_feats.index_select(0, Variable(top_idx)) # mask positive features using GAN batch_pos_feats_backup = batch_pos_feats.data.clone() GAN_model.eval() feat_asdn = GAN_model(batch_pos_feats).view(-1, 3, 3) num = feat_asdn.shape[0] mask_asdn = torch.ones(num, 512, 3, 3) if opts['use_gpu']: mask_asdn = mask_asdn.cuda() for i in range(num): feat_ = feat_asdn[0, :].data.clone() _, idxlist = torch.topk(feat_, 3, largest=False) for j in range(len(idxlist)): idx = idxlist[j] row = int(math.floor(j/3)) col = j % 3 mask_asdn[:, :, col, row] = 0 batch_pos_feats = batch_pos_feats.mul(mask_asdn.view(num, -1)) # forward model.train() pos_score = model(batch_pos_feats, in_layer=in_layer) neg_score = model(batch_neg_feats, in_layer=in_layer) # optimize loss = criterion(pos_score, neg_score) model.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), opts['grad_clip']) optimizer.step() print('Finetune FC: Iter' + str(iter+1) + ', Loss ' + str(loss.item())) # --------- train GAN --------- tic = time.time() GAN_mask_batch_size = opts['GAN_batch_size'] objective = torch.zeros(1, maxiter) # Evaluate mask # print('Evaluating Mask') n = pos_feats.size(0) prob_k = torch.zeros(9, 1) for k in range(0, 9): row = int(math.floor(k/3)) col = k % 3 batch = batch_pos_feats_backup.data.clone() batch = batch.view(-1, 512, 3, 3) batch[:, :, col, row] = 0 batch = batch.view(batch.size(0), -1) # prepare label model.eval() feats = model(batch, in_layer='fc4') # calcute zero position X = feats X_max = torch.max(feats, dim=1)[0] X_max = X_max.repeat(2, 1).permute(1, 0) E = torch.exp(feats-X_max) L = torch.sum(E, 1) Y = torch.div(E, L.repeat(2, 1).permute(1, 0)) prob_k[k] = torch.sum(Y, dim=0)[0] # print('mask {}, value: {:.3f}'.format(k, prob_k[k][0])) _, idx = torch.min(prob_k, 0) row = int(math.floor(idx/3)) col = idx % 3 # train batch = batch_pos_feats_backup.data.clone() GAN_batch_size = opts['GAN_batch_size'] labels = torch.ones(3, 
3, 1, GAN_batch_size) labels[col, row, :] = 0 if opts['use_gpu']: labels = labels.cuda() GAN_model.train() GAN_score = GAN_model(batch).view(3, 3, 1, GAN_batch_size) # optimize GAN_loss = GAN_criterion(GAN_score, labels) GAN_model.zero_grad() GAN_loss.backward() torch.nn.utils.clip_grad_norm_(GAN_model.parameters(), opts['grad_clip']) GAN_optimizer.step() # result objective[:, iter] = loss.item() / GAN_batch_size print('Finetune GAN: Iter %d, Loss %.4f, Time %.3f' % (iter+1, torch.mean(objective[:, 0:iter+1], dim=1).item(), time.time()-tic)) return
def l2_norm(input, axis=1):
    norm = torch.norm(input, 2, axis, True)
    output = torch.div(input, norm)
    return output
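A quick illustrative check for `l2_norm` (assumes `torch` is imported and the definition above is in scope); after normalization every row should have unit L2 norm:

x = torch.randn(4, 128)
x_unit = l2_norm(x, axis=1)
print(torch.norm(x_unit, 2, 1))  # every entry should be close to 1.0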
def divide(self, tensor_in_1, tensor_in_2): return torch.div(tensor_in_1, tensor_in_2)
def forward(self, feature):
    epsilon = 1e-6
    norm = torch.pow(torch.sum(torch.pow(feature, 2), 1) + epsilon, 0.5).unsqueeze(1).expand_as(feature)
    return torch.div(feature, norm)
def prior_loss(prior_std):
    prior_loss = 0.0
    for var in net.parameters():
        nn = torch.div(var, prior_std)
        prior_loss += torch.sum(nn * nn)
    return 0.5 * prior_loss
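`prior_loss` is a zero-mean isotropic Gaussian log-prior over the weights, 0.5 * sum((w / prior_std)^2), which acts like L2 weight decay with coefficient 1 / prior_std^2. A minimal sketch of how it might enter a training objective; the model, data, and per-sample scaling below are assumptions, not taken from the original code:

import torch
import torch.nn as nn

net = nn.Linear(10, 2)                 # hypothetical model; prior_loss reads `net` as a global
criterion = nn.CrossEntropyLoss()

x, y = torch.randn(8, 10), torch.randint(0, 2, (8,))
loss = criterion(net(x), y) + prior_loss(prior_std=1.0) / len(x)  # prior term, scaled per sample
loss.backward()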
def _sample(self, img, ppls, num, pos_emb=None, spa_adj_matrix=None, sem_adj_matrix=None, opt={}): sample_max = opt.get('sample_max', 1) beam_size = opt.get('beam_size', 1) temperature = opt.get('temperature', 1.0) inference_mode = opt.get('inference_mode', True) batch_size = img.size(0) rois_num = ppls.size(1) if beam_size > 1 or self.cbs: return self._sample_beam(img, ppls, num, pos_emb, spa_adj_matrix, sem_adj_matrix, opt) if self.finetune_cnn: conv_feats, fc_feats = self.cnn(img) else: with torch.no_grad(): conv_feats, fc_feats = self.cnn(img.data) # conv_feats, fc_feats = self.cnn(Variable(img.data, volatile=True)) # conv_feats = Variable(conv_feats.data) # fc_feats = Variable(fc_feats.data) # conv_feats, fc_feats = self.cnn(img) rois = ppls.data.new(batch_size, rois_num, 5) rois[:, :, 1:] = ppls.data[:, :, :4] for i in range(batch_size): rois[i, :, 0] = i pool_feats = self.roi_align(conv_feats, Variable(rois.view(-1, 5))) pool_feats = pool_feats.view(batch_size, rois_num, self.att_feat_size) # relationship pool_feats, _ = self.add_relation_feat(pool_feats, pos_emb, spa_adj_matrix, sem_adj_matrix) loc_input = ppls.data.new(batch_size, rois_num, 5) loc_input[:, :, :4] = ppls.data[:, :, :4] / self.image_crop_size loc_input[:, :, 4] = ppls.data[:, :, 5] loc_feats = self.loc_fc(Variable(loc_input)) label_input = ppls.data.new(batch_size, rois_num).long() label_input[:, :] = ppls.data[:, :, 4] label_feat = self.det_fc(Variable(label_input)) # pool_feats = pool_feats + label_feat pool_feats = torch.cat((pool_feats, loc_feats, label_feat), 2) # transpose the conv_feats conv_feats = conv_feats.view(batch_size, self.att_feat_size, -1).transpose(1, 2).contiguous() # embed fc and att feats pool_feats = self.pool_embed(pool_feats) fc_feats = self.fc_embed(fc_feats) conv_feats = self.att_embed(conv_feats) # Project the attention feats first to reduce memory and computation comsumptions. p_conv_feats = self.ctx2att(conv_feats) p_pool_feats = self.ctx2pool(pool_feats) vis_offset = (torch.arange(0, batch_size) * rois_num).view(batch_size).type_as(ppls.data).long() roi_offset = (torch.arange(0, batch_size) * (rois_num + 1)).view(batch_size).type_as( ppls.data).long() # constructing the mask. pnt_mask = ppls.data.new(batch_size, rois_num + 1).byte().fill_(1) for i in range(batch_size): pnt_mask[i, :num.data[i, 1] + 1] = 0 pnt_mask = Variable(pnt_mask) pnt_mask_list = [] pnt_mask_list.append(pnt_mask) att_mask = pnt_mask.clone() state = self.init_hidden(batch_size) seq = [] seqLogprobs = [] bn_seq = [] bnLogprobs = [] fg_seq = [] fgLogprobs = [] for t in range(self.seq_length + 1): if t == 0: # input <bos> it = fc_feats.data.new(batch_size).long().zero_() elif sample_max: sampleLogprobs, it = torch.max(logprobs.data, 1) it = it.view(-1).long() else: if temperature == 1.0: prob_prev = torch.exp( logprobs.data ) # fetch prev distribution: shape Nx(M+1) else: # scale logprobs by temperature prob_prev = torch.exp(torch.div(logprobs.data, temperature)) it = torch.multinomial(prob_prev, 1) sampleLogprobs = logprobs.gather( 1, Variable(it)) # gather the logprobs at sampled positions it = it.view( -1).long() # and flatten indices for downstream processing roi_idx = it.clone() - self.vocab_size - 1 # starting from 0 roi_mask = roi_idx < 0 roi_idx_offset = roi_idx + vis_offset roi_idx_offset[roi_mask] = 0 vis_idx = ppls.data[:, :, 4].clone().view(-1)[roi_idx_offset].long() vis_idx[roi_mask] = 0 # if inference_mode: # if the roi_idx is selected, we need to make sure this is not selected again. 
pnt_idx_offset = roi_idx + roi_offset + 1 pnt_idx_offset[roi_mask] = 0 pnt_mask_new = pnt_mask_list[-1].data.clone() pnt_mask_new.view(-1)[pnt_idx_offset] = 1 pnt_mask_new.view(-1)[0] = 0 pnt_mask_list.append(Variable(pnt_mask_new)) # tmp_feat = concat_feat.view(-1, self.rnn_size)[tmp_idx_offset] # we need to convert the roi index to label index. it_new = it.clone() it_new[it > self.vocab_size] = (vis_idx[roi_mask == 0] + self.vocab_size) xt = self.embed(Variable(it_new)) if t >= 1: # do the cascade caption refinement here roi_labels = pool_feats.data.new(batch_size * rois_num).zero_() if (roi_mask == 0).sum() > 0: roi_labels[roi_idx_offset[roi_mask == 0]] = 1 roi_labels = roi_labels.view(batch_size, 1, rois_num) bn_logprob, fg_logprob = self.ccr_core(vis_idx, pool_feats, \ rnn_output.view(batch_size, 1, self.rnn_size), Variable(roi_labels), batch_size, 1) bn_logprob = bn_logprob.view(batch_size, -1) fg_logprob = fg_logprob.view(batch_size, -1) if sample_max: slp_bn, it_bn = torch.max(bn_logprob.data, 1) slp_fg, it_fg = torch.max(fg_logprob.data, 1) else: if temperature == 1.0: bn_prob_prev = torch.exp(bn_logprob.data) fg_prob_prev = torch.exp(fg_logprob.data) else: bn_prob_prev = torch.exp( torch.div(bn_logprob.data, temperature)) fg_prob_prev = torch.exp( torch.div(fg_logprob.data, temperature)) it_bn = torch.multinomial(bn_prob_prev, 1) it_fg = torch.multinomial(fg_prob_prev, 1) slp_bn = bn_logprob.gather(1, Variable( it_bn)) # gather the logprobs at sampled positions slp_fg = fg_logprob.gather(1, Variable( it_fg)) # gather the logprobs at sampled positions it_bn[roi_mask] = 0 it_fg[roi_mask] = 0 # stop when all finished if t == 1: unfinished = it > 0 else: unfinished = unfinished * (it > 0) # if unfinished.sum() == 0: # break # continue it = it * unfinished.type_as(it) seq.append(it) # seq[t] the input of t+2 time step seqLogprobs.append(sampleLogprobs.view(-1)) bn_seq.append(it_bn) bnLogprobs.append(slp_bn.view(-1)) fg_seq.append(it_fg) fgLogprobs.append(slp_fg.view(-1)) rnn_output, det_prob, state = self.core(xt, fc_feats, conv_feats, p_conv_feats, pool_feats, p_pool_feats, att_mask, pnt_mask_list[-1], state) # pnt_mask = pnt_mask_new # update the mask det_prob = F.log_softmax(det_prob, dim=1) decoded = F.log_softmax(self.beta * self.logit(rnn_output), dim=1) lambda_v = det_prob[:, 0].contiguous() prob_det = det_prob[:, 1:].contiguous() decoded = decoded + lambda_v.view(batch_size, 1).expand_as(decoded) logprobs = torch.cat([decoded, prob_det], 1) # logprobs = torch.log(decoded) seq = torch.cat([_.unsqueeze(1) for _ in seq], 1) seqLogprobs = torch.cat([_.unsqueeze(1) for _ in seqLogprobs], 1) bn_seq = torch.cat([_.unsqueeze(1) for _ in bn_seq], 1) bnLogprobs = torch.cat([_.unsqueeze(1) for _ in bnLogprobs], 1) fg_seq = torch.cat([_.unsqueeze(1) for _ in fg_seq], 1) fgLogprobs = torch.cat([_.unsqueeze(1) for _ in fgLogprobs], 1) return seq, bn_seq, fg_seq, seqLogprobs, bnLogprobs, fgLogprobs
import numpy as np
import torch
import torch.nn as nn

# ----------------------------------- KLDiv loss
loss_f = nn.KLDivLoss(reduction='none')
loss_f_mean = nn.KLDivLoss(reduction='batchmean')

# network output and target distribution
output = torch.from_numpy(np.array([[0.1132, 0.5477, 0.3390]])).float()
output.requires_grad = True
target = torch.from_numpy(np.array([[0.8541, 0.0511, 0.0947]])).float()

loss_1 = loss_f(output, target)
loss_mean = loss_f_mean(output, target)
print('\nloss: ', loss_1)
print('\nloss_mean: ', torch.div(loss_mean, 3))

# To get familiar with the formula, compute the loss of the first element by hand.
# Note there is only a single sample here and the loss is computed element-wise.
output = output[0].detach().numpy()
output_1 = output[0]  # first element of the first sample
target_1 = target[0][0].numpy()
loss_1 = target_1 * (np.log(target_1) - output_1)
print('\nloss of the first element of the first sample:', loss_1)
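One caveat worth noting: `nn.KLDivLoss` expects its input to already be log-probabilities, so in typical use the model output goes through `log_softmax` first; the raw scores above only illustrate the element-wise formula target * (log(target) - input). A small sketch with hypothetical tensors:

import torch
import torch.nn as nn
import torch.nn.functional as F

kld = nn.KLDivLoss(reduction='batchmean')
logits = torch.randn(2, 3)                    # hypothetical raw model scores
log_probs = F.log_softmax(logits, dim=1)      # KLDivLoss expects log-probabilities as input
target = F.softmax(torch.randn(2, 3), dim=1)  # target is a probability distribution
print(kld(log_probs, target))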
def conditional_distributions_loss(model, x, t, e, pdf_u, pdf_c, hr_loss=False, imbalance_loss=False, elbo=True, risk='1'): shape_weibull, scale_weibull, gates_weibull, shape_lognormal, scale_lognormal, logits_lognormal, attention_weights = model.forward( x) lossf_lognormal = [] losss_lognormal = [] hr_lognormal = [] for g in range(model.k): mu = shape_lognormal[:, g] sigma = scale_lognormal[:, g] f = -sigma - 0.5 * np.log(2 * np.pi) f = f - torch.div((torch.log(t) - mu)**2, 2. * torch.exp(2 * sigma)) s = torch.div(torch.log(t) - mu, torch.exp(sigma) * np.sqrt(2)) s = 0.5 - 0.5 * torch.erf(s) s = torch.log(s) lossf_lognormal.append(f) losss_lognormal.append(s) # negative partial log likelihood hr_lognormal.append(f - s) losss_lognormal = torch.stack(losss_lognormal, dim=1) lossf_lognormal = torch.stack(lossf_lognormal, dim=1) hr_lognormal = torch.stack(hr_lognormal, dim=1) if elbo: lossg_lognormal = nn.Softmax(dim=1)(logits_lognormal) losss_lognormal = lossg_lognormal * losss_lognormal lossf_lognormal = lossg_lognormal * lossf_lognormal losss_lognormal = losss_lognormal.sum(dim=1) lossf_lognormal = lossf_lognormal.sum(dim=1) hr_lognormal = lossg_lognormal * hr_lognormal hr_lognormal = hr_lognormal.sum(dim=1) else: lossg_lognormal = nn.LogSoftmax(dim=1)(logits_lognormal) losss_lognormal = lossg_lognormal + losss_lognormal lossf_lognormal = lossg_lognormal + lossf_lognormal losss_lognormal = torch.logsumexp(losss_lognormal, dim=1) lossf_lognormal = torch.logsumexp(lossf_lognormal, dim=1) # Weibull distriubtion shapes_weibull, scales_weibull = shape_weibull.exp(), ( -scale_weibull).exp() lossf_weibull, losss_weibull = [], [] hr_weibull = [] for idx in range(model.k): eta = shapes_weibull[:, idx] beta = scales_weibull[:, idx] log_s_weibull = -(torch.pow(t / beta, eta)) log_f_weibull = torch.log(eta) - torch.log(beta) + ( (eta - 1) * (-torch.log(beta) + torch.log(t))) log_f_weibull = log_f_weibull + log_s_weibull lossf_weibull.append(log_f_weibull) losss_weibull.append(log_s_weibull) # negative partial log likelihood hr_weibull.append(torch.log(eta / beta * (t / beta)**(eta - 1))) losss_weibull = torch.stack(losss_weibull, dim=1) lossf_weibull = torch.stack(lossf_weibull, dim=1) hr_weibull = torch.stack(hr_weibull, dim=1) if elbo: lossg_weibull = nn.Softmax(dim=1)(gates_weibull) losss_weibull = lossg_weibull * losss_weibull lossf_weibull = lossg_weibull * lossf_weibull losss_weibull = losss_weibull.sum(dim=1) lossf_weibull = lossf_weibull.sum(dim=1) hr_weibull = hr_weibull * lossg_weibull hr_weibull = hr_weibull.sum(dim=1) else: lossg_weibull = nn.LogSoftmax(dim=1)(gates_weibull) losss_weibull = lossg_weibull + losss_weibull lossf_weibull = lossg_weibull + lossf_weibull losss_weibull = torch.logsumexp(losss_weibull, dim=1) lossf_weibull = torch.logsumexp(lossf_weibull, dim=1) # Combine lossf, losss = torch.stack([lossf_lognormal, lossf_weibull], dim=1), torch.stack( [losss_lognormal, losss_weibull], dim=1) weights = nn.Softmax(dim=1)(attention_weights) #hr = torch.stack([hr_weibull, hr_lognormal], dim=1) hr = torch.stack( [lossf_lognormal - losss_lognormal, lossf_weibull - losss_weibull], dim=1) hr = hr * weights hr = hr.sum(dim=1) loss_neg = PartialLogLikelihood()(hr, e) lossf = lossf * weights losss = losss * weights lossf = lossf.sum(dim=1) losss = losss.sum(dim=1) # if imbalance_loss: try: idx_time = t.int().cpu().detach().numpy() pdf_u_ = torch.tensor(pdf_u).cuda() pdf_c_ = torch.tensor(pdf_c).cuda() lossf = lossf * (1 - pdf_u_[idx_time]) #.exp() losss = losss * (1 - pdf_c_[idx_time]) 
#.exp() except: pass uncens = np.where(e.cpu().data.numpy() == int(risk))[0] cens = np.where(e.cpu().data.numpy() != int(risk))[0] ll = lossf[uncens].sum() + model.discount * losss[cens].sum() if hr_loss and e.sum() > 0: return -ll / float(len(uncens) + len(cens)) + loss_neg * model.gamma else: return -ll / float(len(uncens) + len(cens))
def forward(self, x):
    norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps
    x = torch.div(x, norm)
    out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x
    return out
def _generate(self, encoder_input, beam_size=None, maxlen=None, prefix_tokens=None): src_tokens = encoder_input[0] bsz, srclen = src_tokens.size() maxlen = min(maxlen, self.maxlen) if maxlen is not None else self.maxlen # the max beam size is the dictionary size - 1, since we never select pad beam_size = beam_size if beam_size is not None else self.beam_size assert ( beam_size < self.vocab_size ), "Beam size must be smaller than target vocabulary" # Encode, expanding outputs for each example beam_size times reorder_indices = torch.arange(bsz).view(-1, 1).repeat(1, beam_size).view(-1) encoder_outs, incremental_states = self._encode( encoder_input=encoder_input, reorder_indices=reorder_indices.type_as(src_tokens), ) # initialize buffers scores = src_tokens.new(bsz * beam_size, maxlen + 1).float().fill_(0) scores_buf = scores.clone() tokens = src_tokens.new(bsz * beam_size, maxlen + 2).fill_(self.pad) tokens_buf = tokens.clone() tokens[:, 0] = self.eos # may differ from input length if isinstance(encoder_outs[0], (list, tuple)): src_encoding_len = encoder_outs[0][0].size(0) elif isinstance(encoder_outs[0], dict): src_encoding_len = encoder_outs[0]["encoder_out"].size(0) attn = scores.new(bsz * beam_size, src_encoding_len, maxlen + 2) attn_buf = attn.clone() # list of completed sentences finalized = [[] for i in range(bsz)] finished = [False for i in range(bsz)] worst_finalized = [{"idx": None, "score": -math.inf} for i in range(bsz)] num_remaining_sent = bsz # number of candidate hypos per step cand_size = 2 * beam_size # 2 x beam size in case half are EOS # offset arrays for converting between different indexing schemes bbsz_offsets = (torch.arange(0, bsz) * beam_size).unsqueeze(1).type_as(tokens) cand_offsets = torch.arange(0, cand_size).type_as(tokens) # helper function for allocating buffers on the fly buffers = {} def buffer(name, type_of=tokens): # noqa if name not in buffers: buffers[name] = type_of.new() return buffers[name] def is_finished(sent, step, unfinalized_scores=None): """ Check whether we've finished generation for a given sentence, by comparing the worst score among finalized hypotheses to the best possible score among unfinalized hypotheses. """ assert len(finalized[sent]) <= beam_size if len(finalized[sent]) == beam_size: if self.stop_early or step == maxlen or unfinalized_scores is None: return True # stop if the best unfinalized score is worse than the worst # finalized one best_unfinalized_score = unfinalized_scores[sent].max() if self.normalize_scores: best_unfinalized_score /= (maxlen + 1) ** self.len_penalty if worst_finalized[sent]["score"] >= best_unfinalized_score: return True return False def finalize_hypos(step, bbsz_idx, eos_scores, unfinalized_scores=None): """ Finalize the given hypotheses at this step, while keeping the total number of finalized hypotheses per sentence <= beam_size. Note: the input must be in the desired finalization order, so that hypotheses that appear earlier in the input are preferred to those that appear later. 
Args: step: current time step bbsz_idx: A vector of indices in the range [0, bsz*beam_size), indicating which hypotheses to finalize eos_scores: A vector of the same size as bbsz_idx containing scores for each hypothesis unfinalized_scores: A vector containing scores for all unfinalized hypotheses """ assert bbsz_idx.numel() == eos_scores.numel() # clone relevant token and attention tensors tokens_clone = tokens.index_select(0, bbsz_idx) tokens_clone = tokens_clone[ :, 1 : step + 2 ] # skip the first index, which is EOS tokens_clone[:, step] = self.eos attn_clone = attn.index_select(0, bbsz_idx)[:, :, 1 : step + 2] # compute scores per token position pos_scores = scores.index_select(0, bbsz_idx)[:, : step + 1] pos_scores[:, step] = eos_scores # convert from cumulative to per-position scores pos_scores[:, 1:] = pos_scores[:, 1:] - pos_scores[:, :-1] # normalize sentence-level scores if self.normalize_scores: eos_scores /= (step + 1) ** self.len_penalty sents_seen = set() for i, (idx, score) in enumerate( zip(bbsz_idx.tolist(), eos_scores.tolist()) ): sent = idx // beam_size sents_seen.add(sent) def get_hypo(): _, alignment = attn_clone[i].max(dim=0) return { "tokens": tokens_clone[i], "score": score, "attention": attn_clone[i], # src_len x tgt_len "alignment": alignment, "positional_scores": pos_scores[i], } if len(finalized[sent]) < beam_size: finalized[sent].append(get_hypo()) elif not self.stop_early and score > worst_finalized[sent]["score"]: # replace worst hypo for this sentence with new/better one worst_idx = worst_finalized[sent]["idx"] if worst_idx is not None: finalized[sent][worst_idx] = get_hypo() # find new worst finalized hypo for this sentence idx, s = min( enumerate(finalized[sent]), key=lambda r: r[1]["score"] ) worst_finalized[sent] = {"score": s["score"], "idx": idx} # return number of hypotheses finished this step num_finished = 0 for sent in sents_seen: # check termination conditions for this sentence if not finished[sent] and is_finished(sent, step, unfinalized_scores): finished[sent] = True num_finished += 1 return num_finished reorder_state = None for step in range(maxlen + 1): # one extra step for EOS marker # reorder decoder internal states based on the prev choice of beams if reorder_state is not None: for model in self.models: if isinstance(model.decoder, FairseqIncrementalDecoder): model.decoder.reorder_incremental_state( incremental_states[model], reorder_state ) # Run decoder for one step logprobs, avg_attn, possible_translation_tokens = self._decode( tokens[:, : step + 1], encoder_outs, incremental_states ) if step == 0: # at the first step all hypotheses are equally likely, so use # only the first beam logprobs = logprobs.unfold(0, 1, beam_size).squeeze(2).contiguous() scores = scores.type_as(logprobs) scores_buf = scores_buf.type_as(logprobs) else: # make probs contain cumulative scores for each hypothesis logprobs.add_(scores[:, step - 1].view(-1, 1)) logprobs[:, self.pad] = -math.inf # never select pad # apply unk reward if possible_translation_tokens is None: unk_index = self.unk else: unk_index = torch.nonzero(possible_translation_tokens == self.unk)[0, 0] logprobs[:, unk_index] += self.unk_reward # external lexicon reward logprobs[:, self.lexicon_indices] += self.lexicon_reward logprobs += self.word_reward logprobs[:, self.eos] -= self.word_reward # Record attention scores attn[:, :, step + 1].copy_(avg_attn) cand_scores = buffer("cand_scores", type_of=scores) cand_indices = buffer("cand_indices") cand_beams = buffer("cand_beams") eos_bbsz_idx = 
buffer("eos_bbsz_idx") eos_scores = buffer("eos_scores", type_of=scores) if step < maxlen: if prefix_tokens is not None and step < prefix_tokens.size(1): logprobs_slice = logprobs.view(bsz, -1, logprobs.size(-1))[:, 0, :] cand_scores = torch.gather( logprobs_slice, dim=1, index=prefix_tokens[:, step].view(-1, 1) ).expand(-1, cand_size) cand_indices = ( prefix_tokens[:, step].view(-1, 1).expand(bsz, cand_size) ) cand_beams.resize_as_(cand_indices).fill_(0) else: # take the best 2 x beam_size predictions. We'll choose the first # beam_size of these which don't predict eos to continue with. torch.topk( logprobs.view(bsz, -1), k=min( cand_size, logprobs.view(bsz, -1).size(1) - 1 ), # -1 so we never select pad out=(cand_scores, cand_indices), ) possible_tokens_size = self.vocab_size if possible_translation_tokens is not None: possible_tokens_size = possible_translation_tokens.size(0) # cand_indices has values in [0, vocab_size * beam_size] # the following does euclidean division bu vocab_size # to retrieve the beam and word id of each candidate torch.div(cand_indices, possible_tokens_size, out=cand_beams) cand_indices.fmod_(possible_tokens_size) # Handle vocab reduction if possible_translation_tokens is not None: possible_translation_tokens = possible_translation_tokens.view( 1, possible_tokens_size ).expand(cand_indices.size(0), possible_tokens_size) cand_indices = torch.gather( possible_translation_tokens, dim=1, index=cand_indices, out=cand_indices, ) else: # finalize all active hypotheses once we hit maxlen # pick the hypothesis with the highest log prob of EOS right now torch.sort( logprobs[:, self.eos], descending=True, out=(eos_scores, eos_bbsz_idx), ) num_remaining_sent -= finalize_hypos(step, eos_bbsz_idx, eos_scores) assert num_remaining_sent == 0 break # cand_bbsz_idx contains beam indices for the top candidate # hypotheses, with a range of values: [0, bsz*beam_size), # and dimensions: [bsz, cand_size] cand_bbsz_idx = cand_beams.add_(bbsz_offsets) # finalize hypotheses that end in eos eos_mask = cand_indices.eq(self.eos) if step >= self.minlen: # only consider eos when it's among the top beam_size indices torch.masked_select( cand_bbsz_idx[:, :beam_size], mask=eos_mask[:, :beam_size], out=eos_bbsz_idx, ) if eos_bbsz_idx.numel() > 0: torch.masked_select( cand_scores[:, :beam_size], mask=eos_mask[:, :beam_size], out=eos_scores, ) num_remaining_sent -= finalize_hypos( step, eos_bbsz_idx, eos_scores, cand_scores ) assert num_remaining_sent >= 0 if num_remaining_sent == 0: break assert step < maxlen # set active_mask so that values > cand_size indicate eos hypos # and values < cand_size indicate candidate active hypos. 
# After, the min values per row are the top candidate active hypos active_mask = buffer("active_mask") torch.add( eos_mask.type_as(cand_offsets) * cand_size, cand_offsets[: eos_mask.size(1)], out=active_mask, ) # get the top beam_size active hypotheses, which are just the hypos # with the smallest values in active_mask active_hypos, _ignore = buffer("active_hypos"), buffer("_ignore") torch.topk( active_mask, k=beam_size, dim=1, largest=False, out=(_ignore, active_hypos), ) active_bbsz_idx = buffer("active_bbsz_idx") torch.gather(cand_bbsz_idx, dim=1, index=active_hypos, out=active_bbsz_idx) active_scores = torch.gather( cand_scores, dim=1, index=active_hypos, out=scores[:, step].view(bsz, beam_size), ) active_bbsz_idx = active_bbsz_idx.view(-1) active_scores = active_scores.view(-1) # copy tokens and scores for active hypotheses torch.index_select( tokens[:, : step + 1], dim=0, index=active_bbsz_idx, out=tokens_buf[:, : step + 1], ) torch.gather( cand_indices, dim=1, index=active_hypos, out=tokens_buf.view(bsz, beam_size, -1)[:, :, step + 1], ) if step > 0: torch.index_select( scores[:, :step], dim=0, index=active_bbsz_idx, out=scores_buf[:, :step], ) torch.gather( cand_scores, dim=1, index=active_hypos, out=scores_buf.view(bsz, beam_size, -1)[:, :, step], ) # copy attention for active hypotheses torch.index_select( attn[:, :, : step + 2], dim=0, index=active_bbsz_idx, out=attn_buf[:, :, : step + 2], ) # swap buffers tokens, tokens_buf = tokens_buf, tokens scores, scores_buf = scores_buf, scores attn, attn_buf = attn_buf, attn # reorder incremental state in decoder reorder_state = active_bbsz_idx # sort by score descending for sent in range(bsz): finalized[sent] = sorted( finalized[sent], key=lambda r: r["score"], reverse=True ) return finalized
def sinkhorn_knopp(a, b, C, reg=1e-1, maxIter=1000, stopThr=1e-9, verbose=False, log=False, warm_start=None, eval_freq=10, print_freq=200, **kwargs): """ Solve the entropic regularization optimal transport The input should be PyTorch tensors The function solves the following optimization problem: .. math:: \gamma = arg\min_\gamma <\gamma,C>_F + reg\cdot\Omega(\gamma) s.t. \gamma 1 = a \gamma^T 1= b \gamma\geq 0 where : - C is the (ns,nt) metric cost matrix - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - a and b are target and source measures (sum to 1) The algorithm used for solving the problem is the Sinkhorn-Knopp matrix scaling algorithm as proposed in [1]. Parameters ---------- a : torch.tensor (na,) samples measure in the target domain b : torch.tensor (nb,) samples in the source domain C : torch.tensor (na,nb) loss matrix reg : float Regularization term > 0 maxIter : int, optional Max number of iterations stopThr : float, optional Stop threshol on error ( > 0 ) verbose : bool, optional Print information along iterations log : bool, optional record log if True Returns ------- gamma : (na x nb) torch.tensor Optimal transportation matrix for the given parameters log : dict log dictionary return only if log==True in parameters References ---------- [1] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013 See Also -------- """ device = a.device na, nb = C.shape assert na >= 1 and nb >= 1, 'C needs to be 2d' assert na == a.shape[0] and nb == b.shape[ 0], "Shape of a or b does't match that of C" assert reg > 0, 'reg should be greater than 0' # assert a.min() >= 0. and b.min() >= 0., 'Elements in a or b less than 0' # unnecessary check for our special case if log: log = {'err': []} if warm_start is not None: u = warm_start['u'] v = warm_start['v'] else: u = torch.ones(na, dtype=a.dtype).to(device) / na v = torch.ones(nb, dtype=b.dtype).to(device) / nb K = torch.empty(C.shape, dtype=C.dtype).to(device) torch.div(C, -reg, out=K) torch.exp(K, out=K) b_hat = torch.empty(b.shape, dtype=C.dtype).to(device) it = 1 err = 1 # allocate memory beforehand KTu = torch.empty(v.shape, dtype=v.dtype).to(device) Kv = torch.empty(u.shape, dtype=u.dtype).to(device) while (err > stopThr and it <= maxIter): upre, vpre = u, v torch.matmul(u, K, out=KTu) v = torch.div(b, KTu + M_EPS) torch.matmul(K, v, out=Kv) u = torch.div(a, Kv + M_EPS) if torch.any(torch.isnan(u)) or torch.any(torch.isnan(v)) or \ torch.any(torch.isinf(u)) or torch.any(torch.isinf(v)): print('Warning: numerical errors at iteration', it) u, v = upre, vpre break if log and it % eval_freq == 0: # we can speed up the process by checking for the error only all # the eval_freq iterations # below is equivalent to: # b_hat = torch.sum(u.reshape(-1, 1) * K * v.reshape(1, -1), 0) # but with more memory efficient b_hat = torch.matmul(u, K) * v err = (b - b_hat).pow(2).sum().item() # err = (b - b_hat).abs().sum().item() log['err'].append(err) if verbose and it % print_freq == 0: print('iteration {:5d}, constraint error {:5e}'.format(it, err)) it += 1 if log: log['u'] = u log['v'] = v log['alpha'] = reg * torch.log(u + M_EPS) log['beta'] = reg * torch.log(v + M_EPS) # transport plan P = u.reshape(-1, 1) * K * v.reshape(1, -1) if log: return P, log else: return P
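A minimal usage sketch for `sinkhorn_knopp` with toy uniform marginals and a random cost matrix; `M_EPS` is a small module-level constant that the implementation above assumes exists:

import torch

M_EPS = 1e-16                      # assumed module-level constant used inside sinkhorn_knopp
na, nb = 5, 7
a = torch.full((na,), 1.0 / na)    # uniform target marginal
b = torch.full((nb,), 1.0 / nb)    # uniform source marginal
C = torch.rand(na, nb)             # toy cost matrix
P, log_dict = sinkhorn_knopp(a, b, C, reg=0.1, log=True)
print(P.sum(dim=1))                # should be close to a
print(P.sum(dim=0))                # should be close to b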
def main(exp_config): # ===================== # Load network # ===================== model = models.ResNet34(num_c=exp_config.num_classes) summary(model.cuda(), input_size=(3, 32, 32)) # display the layers of the network model.cuda() # copy the model into gpu # ========================= # Load source dataset and pre-trained model # ========================= source_train_loader, source_test_loader, _ = load_datasets( exp_config.data_identifier_source, exp_config.batch_size) model.load_state_dict(torch.load(exp_config.pre_trained_net)) model.eval() # ========================= # KDE-based OOD detection # ========================= # Open a .txt file to save the OOD detection results path_to_saved_results = 'results/' + exp_config.experiment_name + '/' + exp_config.method_name + '/results_' + str( exp_config.number_of_samples_for_KDE) + '.txt' f = open(path_to_saved_results, "w") # Compute number of layers in the network num_layers = KDE.compute_num_layers(exp_config, model) # Compute features for each channel for the test set of in-distribution dataset # get_features function returns MxN tensor where M is the number of samples # and N is the number of channels print('Calculating features for the test set of in-distribution dataset') feature_in_test = KDE.get_features(exp_config, model, num_layers, source_test_loader) # Compute features for each channel for the training set of in-distribution dataset print( 'Calculating features for the training set of in-distribution dataset') feature_in_train = KDE.get_features(exp_config, model, num_layers, source_train_loader, is_train=True) # Compute features for each channel for the adversarially perturbed version of training set of in-distribution dataset print('Calculating features for the adversarial images') feature_in_train_perturbed = KDE.get_features(exp_config, model, num_layers, source_train_loader, perturb=True, is_train=True) # Calculate features for each OOD dataset print('Calculating features for each OOD dataset') feature_ood = {} for target in exp_config.data_identifier_target: _, ood_loader, _ = load_datasets(target, exp_config.batch_size) feature_ood[target] = KDE.get_features(exp_config, model, num_layers, ood_loader) if exp_config.std_type == 'kNN': # Load pre-computed sigma values for each channel using kNN as proposed in the paper - COMPUTATIONALLY INEFFICIENT std = torch.Tensor( np.load('results/std_%s.npy' % (exp_config.data_identifier_source))).cuda() elif exp_config.std_type == 'interquartile': # Compute signa values for each channel using interquartiles - COMPUTATIONALLY EFFICIENT AND LEADS TO SIMILAR RESULTS IN THE PAPER sorted_feature_in_train, _ = torch.sort(feature_in_train, axis=0) emp_std = torch.std(feature_in_test, axis=0) Q1, Q3 = torch.median( sorted_feature_in_train[0:sorted_feature_in_train.shape[0] // 2], axis=0).values, torch.median( sorted_feature_in_train[(sorted_feature_in_train.shape[0] // 2):], axis=0).values IQR = Q3 - Q1 std = 0.9 * torch.min(torch.cat( [torch.unsqueeze(emp_std, 0), torch.unsqueeze(IQR, 0) / 1.34], axis=0), axis=0).values * (feature_in_train.shape[0] **(-1 / 5)) # Calculate confidence scores using KDE for test set of the in-distribution dataset print( 'Calculating confidence scores using KDE for the test set of the in-distribution dataset' ) constant = 1 / (std * torch.sqrt(torch.Tensor([2 * math.pi]).cuda())) scores_in_test = 0 for i in range(feature_in_train.shape[0]): zero_x = feature_in_test - feature_in_train[i] scores_in_test += constant * torch.exp( -0.5 * (torch.pow(torch.div(zero_x, 
std), 2))) scores_in_test /= feature_in_train.shape[0] scores_in_test = scores_in_test.detach().cpu().numpy() # Calculate confidence scores using KDE for training set of the in-distribution dataset print( 'Calculating confidence scores using KDE for the training set of the in-distribution dataset' ) scores_in_train = 0 for i in range(feature_in_train.shape[0]): zero_x = feature_in_train - feature_in_train[i] scores_in_train += constant * torch.exp( -0.5 * (torch.pow(torch.div(zero_x, std), 2))) scores_in_train /= feature_in_train.shape[0] scores_in_train = scores_in_train.detach().cpu().numpy() # Calculate confidence scores using KDE for the adversarially perturbed version of training set of the in-distribution dataset print('Calculating confidence scores using KDE for the adversarial images') scores_in_train_perturbed = 0 for i in range(feature_in_train.shape[0]): zero_x = feature_in_train_perturbed - feature_in_train[i] scores_in_train_perturbed += constant * torch.exp( -0.5 * (torch.pow(torch.div(zero_x, std), 2))) scores_in_train_perturbed /= feature_in_train.shape[0] scores_in_train_perturbed = scores_in_train_perturbed.detach().cpu().numpy( ) # Calculate confidence scores using KDE for OOD datasets print('Calculating confidence scores using KDE for OOD datasets') scores_ood = {} for target in exp_config.data_identifier_target: scores_ood[target] = 0 for i in range(feature_in_train.shape[0]): zero_x = feature_ood[target] - feature_in_train[i] scores_ood[target] += constant * torch.exp( -0.5 * (torch.pow(torch.div(zero_x, std), 2))) scores_ood[target] /= feature_in_train.shape[0] scores_ood[target] = scores_ood[target].detach().cpu().numpy() # Calculate OOD detection accuracy print('Calculating OOD detection accuracy') # Find channels that best distinguishes scores of in-distribution test set from the adversarial images y_pred = np.concatenate((scores_in_test, scores_in_train_perturbed), axis=0) label = np.concatenate((np.ones(scores_in_test.shape[0]), np.zeros(scores_in_train_perturbed.shape[0])), axis=0) fpr_all = [] for i in range(scores_in_test.shape[1]): fpr_at_95_tpr, detection_error, auroc, aupr_in = calculate_ood_detection_performance_metrics( label, y_pred[:, i], str(i), display=False) fpr_all.append(fpr_at_95_tpr) # Create training set to train logistic regression X_train = np.concatenate( (np.sort(scores_in_train[:, np.argsort(fpr_all)[:50]], axis=1), np.sort(scores_in_train_perturbed[:, np.argsort(fpr_all)[:50]], axis=1)), axis=0) Y_train = np.concatenate((np.zeros(scores_in_train.shape[0]), np.ones(scores_in_train_perturbed.shape[0])), axis=0) # Train logistic regression lr = LogisticRegressionCV(n_jobs=-1).fit(X_train, Y_train) # Evaluate logistic regression on each OOD dataset and compute OOD detection accuracy f.write('Target \t\t FPRat95TPR \t DetErr \t AUROC \t\t AUPR_IN \n') print('Target \t\t TPRat95TPR \t DetErr \t AUROC \t\t AUPR_IN \n') for target in exp_config.data_identifier_target: X_test = np.concatenate( (np.sort(scores_in_test[:, np.argsort(fpr_all)[:50]], axis=1), np.sort(scores_ood[target][:, np.argsort(fpr_all)[:50]], axis=1)), axis=0) Y_test = np.concatenate((np.zeros( scores_in_test.shape[0]), np.ones(scores_ood[target].shape[0])), axis=0) y_pred = lr.predict_proba(X_test)[:, 1] fpr_at_95_tpr, detection_error, auroc, aupr_in = calculate_ood_detection_performance_metrics( Y_test, y_pred, target, display=True) f.write(('%8s \t %.5f \t %.5f \t %.5f \t %.5f \n\n') % (target, fpr_at_95_tpr, detection_error, auroc, aupr_in)) print('Results are saved to ' + 
path_to_saved_results) f.close()
def forward(self, q, q_len):
    embedded = self.embedding(q)
    q_len = Variable(torch.Tensor(q_len).view(-1, 1) + 1e-12, requires_grad=False).cuda()
    return torch.div(torch.sum(embedded, 1), q_len)
def attack_dataset(self, args, arch, result_dump_path): success = 0 queries = [] not_done = [] correct_all = [] total = 0 for batch_idx, data_tuple in enumerate(self.data_loader): if args.dataset == "ImageNet": if self.model.input_size[-1] >= 299: images, true_labels = data_tuple[1], data_tuple[2] else: images, true_labels = data_tuple[0], data_tuple[2] else: images, true_labels = data_tuple[0], data_tuple[1] if images.size(-1) != self.model.input_size[-1]: images = F.interpolate(images, size=self.model.input_size[-1], mode='bilinear', align_corners=True) self.image_height = images.size(2) self.image_width = images.size(3) eps = args.epsilon if args.norm == 'l2': # epsilon = 1e-3 # eps = np.sqrt(epsilon * model.input_size[-1] * model.input_size[-1] * self.in_channels) # 1.752 learning_rate = 2.0 / np.sqrt( self.image_height * self.image_width * self.in_channels) else: learning_rate = 0.005 images = images.cuda() true_labels = true_labels.cuda() with torch.no_grad(): logits = self.model(images) pred = logits.argmax(dim=1) correct = pred.eq(true_labels).detach().cpu().numpy().astype( np.int32) correct_all.append(correct) if correct[0].item() == 0: queries.append(0) not_done.append(0) # 原本就分类错了,not_done = 0 log.info( "The {}-th image is already classified incorrectly.") continue if self.targeted: if self.target_type == 'random': target_labels = torch.randint( low=0, high=CLASS_NUM[args.dataset], size=true_labels.size()).long().cuda() invalid_target_index = target_labels.eq(true_labels) while invalid_target_index.sum().item() > 0: target_labels[invalid_target_index] = torch.randint( low=0, high=logits.shape[1], size=target_labels[invalid_target_index].shape ).long().cuda() invalid_target_index = target_labels.eq(true_labels) elif args.target_type == 'least_likely': target_labels = logits.argmin(dim=1) elif args.target_type == "increment": target_labels = torch.fmod(true_labels + 1, CLASS_NUM[args.dataset]) else: raise NotImplementedError('Unknown target_type: {}'.format( args.target_type)) else: target_labels = None total += images.size(0) sigma = args.sigma np.random.seed(0) torch.manual_seed(0) torch.cuda.manual_seed(0) adv_images = images.clone().cuda() assert images.size(0) == 1 l = self.xent_loss(logits, true_labels, target_labels) # 按照元素论文来写的,好奇怪 lr = float(learning_rate) total_q = 0 ite = 0 self.meta_model_for_q1.load_state_dict( self.pretrained_meta_weights) self.meta_model_for_q2.load_state_dict( self.pretrained_meta_weights) while total_q <= args.max_queries: total_q += 1 # true = torch.squeeze(self.get_grad(self.model, adv_images, true_labels, target_labels)) # C,H,W, # 其实没啥用,只是为了看看估计的准不准 # log.info("Grad norm : {:.3f}".format(torch.sqrt(torch.sum(true * true)).item())) if ite % 2 == 0 and sigma != args.sigma: log.info("checking if sigma could be set to be 1e-4") rand = torch.randn_like(adv_images) rand = torch.div( rand, torch.clamp(torch.sqrt( torch.mean(torch.mul(rand, rand))), min=1e-12)) logits_1 = self.model(adv_images + args.sigma * rand) rand_loss = self.xent_loss( logits_1, true_labels, target_labels) # shape = (batch_size,) total_q += 1 rand = torch.randn_like(adv_images) rand = torch.div( rand, torch.clamp(torch.sqrt( torch.mean(torch.mul(rand, rand))), min=1e-12)) logits_2 = self.model(adv_images + args.sigma * rand) rand_loss2 = self.xent_loss( logits_2, true_labels, target_labels) # shape = (batch_size,) total_q += 1 if (rand_loss - l)[0].item() != 0 and (rand_loss2 - l)[0].item() != 0: sigma = args.sigma log.info("set sigma back to 1e-4, sigma={:.4f}".format( sigma)) 
if args.method != "uniform": prior = torch.squeeze( self.get_grad(self.surrogate_model, adv_images, true_labels, target_labels)) # C,H,W # 下面求得余弦值 # alpha = torch.sum(true * prior) / torch.clamp(torch.sqrt(torch.sum(true * true) * torch.sum(prior * prior)), min=1e-12) # 这个alpha仅仅用来看看梯度对不对,后续会更新 # log.info("alpha = {:.3}".format(alpha)) prior = prior / torch.clamp(torch.sqrt( torch.mean(torch.mul(prior, prior))), min=1e-12) if args.method == "biased": start_iter = 3 # 是只有start_iter=3的时候算一下gradient norm if ite % 10 == 0 or ite == start_iter: # Estimate norm of true gradient s = 10 # pert shape = 10,C,H,W pert = torch.randn(size=(s, adv_images.size(1), adv_images.size(2), adv_images.size(3))) for i in range(s): pert[i] = pert[i] / torch.clamp(torch.sqrt( torch.mean(torch.mul(pert[i], pert[i]))), min=1e-12) pert = pert.cuda() # pert = (10,C,H,W), adv_images = (1,C,H,W) eval_points = adv_images + sigma * pert # broadcast, because tensor shape doesn't match exactly # eval_points shape = (10,C,H,W) reshape to (10*1, C, H, W) eval_points = eval_points.view(-1, adv_images.size(1), adv_images.size(2), adv_images.size(3)) target_labels_s = None if target_labels is not None: target_labels_s = target_labels.repeat(s) if ite % self.meta_predict_steps == 0: logits_for_q1 = self.model(eval_points) total_q += s self.finetune_meta_model(self.meta_model_for_q1, self.meta_optimizer_q1, eval_points, logits_for_q1) else: with torch.no_grad(): logits_for_q1 = self.meta_model_for_q1.forward( eval_points) losses = self.xent_loss( logits_for_q1, true_labels.repeat(s), target_labels_s) # shape = (10*B,) norm_square = torch.mean( ((losses - l) / sigma)**2) # scalar while True: logits_for_prior_loss = self.model( adv_images + sigma * prior) # prior may be C,H,W prior_loss = self.xent_loss( logits_for_prior_loss, true_labels, target_labels) # shape = (batch_size,) total_q += 1 diff_prior = (prior_loss - l)[0].item() if diff_prior == 0: sigma *= 2 log.info( "sigma={:.4f}, multiply sigma by 2".format( sigma)) else: break est_alpha = diff_prior / sigma / torch.clamp(torch.sqrt( torch.sum(torch.mul(prior, prior)) * norm_square), min=1e-12) est_alpha = est_alpha.item() log.info("Estimated alpha = {:.3f}".format(est_alpha)) if np.isnan(est_alpha): # est_alpha = np.nan_to_num(est_alpha) not_done.append(1) queries.append(args.max_queries) log.info("{}-th image failed because of nan".format( batch_idx)) break alpha = est_alpha # alpha描述了替代模型的梯度是否有用,alpha越大λ也越大,λ=1表示相信这个prior if alpha < 0: # 夹角大于90度,cos变成负数 prior = -prior # v = -v , negative the transfer gradient, alpha = -alpha q = args.samples_per_draw n = self.image_height * self.image_width * self.in_channels d = 50 * 50 * self.in_channels gamma = 3.5 A_square = d / n * gamma return_prior = False if args.method == 'biased': if args.dataprior: best_lambda = A_square * ( A_square - alpha**2 * (d + 2 * q - 2)) / (A_square**2 + alpha**4 * d**2 - 2 * A_square * alpha**2 * (q + d * q - 1)) else: best_lambda = (1 - alpha**2) * ( 1 - alpha**2 * (n + 2 * q - 2)) / (alpha**4 * n * (n + 2 * q - 2) - 2 * alpha**2 * n * q + 1) log.info("best_lambda = {:.4f}".format(best_lambda)) if best_lambda < 1 and best_lambda > 0: lmda = best_lambda else: if alpha**2 * (n + 2 * q - 2) < 1: lmda = 0 else: lmda = 1 if abs(alpha) >= 1: lmda = 1 log.info("lambda = {:.3f}".format(lmda)) if lmda == 1: return_prior = True # lmda =1, we trust this prior as true gradient elif args.method == "fixed_biased": lmda = 0.5 if not return_prior: if args.dataprior: upsample = nn.UpsamplingNearest2d( size=( 
adv_images.size(-2), adv_images.size(-1))) # H, W of original image pert = torch.randn(size=(q, self.in_channels, 50, 50)) pert = upsample(pert) else: pert = torch.randn( size=(q, adv_images.size(-3), adv_images.size(-2), adv_images.size(-1))) # q,C,H,W pert = pert.cuda() for i in range(q): if args.method == 'biased' or args.method == 'fixed_biased': angle_prior = torch.sum(pert[i] * prior) / \ torch.clamp(torch.sqrt(torch.sum(pert[i] * pert[i]) * torch.sum(prior * prior)),min=1e-12) # C,H,W x B,C,H,W pert[i] = pert[ i] - angle_prior * prior # prior = B,C,H,W so pert[i] = B,C,H,W # FIXME 这里不支持batch模式 pert[i] = pert[i] / torch.clamp(torch.sqrt( torch.mean(torch.mul(pert[i], pert[i]))), min=1e-12) # pert[i]就是论文算法1的第九行第二项的最右边的一串 pert[i] = np.sqrt(1 - lmda) * pert[i] + np.sqrt( lmda) * prior # paper's Algorithm 1: line 9 else: pert[i] = pert[i] / torch.clamp(torch.sqrt( torch.mean(torch.mul(pert[i], pert[i]))), min=1e-12) while True: eval_points = adv_images + sigma * pert # (1,C,H,W) pert=(q,C,H,W) if ite % self.meta_predict_steps == 0: logits_for_q2 = self.model(eval_points) total_q += q self.finetune_meta_model(self.meta_model_for_q2, self.meta_optimizer_q2, eval_points, logits_for_q2) else: with torch.no_grad(): logits_for_q2 = self.meta_model_for_q2.forward( eval_points) target_labels_q = None if target_labels is not None: target_labels_q = target_labels.repeat(q) losses = self.xent_loss( logits_for_q2, true_labels.repeat(q), target_labels_q) # shape = (q,) grad = (losses - l).view( -1, 1, 1, 1) * pert # (q,1,1,1) * (q,C,H,W) grad = torch.mean(grad, dim=0, keepdim=True) # 1,C,H,W norm_grad = torch.sqrt( torch.mean(torch.mul(grad, grad))) if norm_grad.item() == 0: sigma *= 5 log.info( "estimated grad == 0, multiply sigma by 5. Now sigma={:.4f}" .format(sigma)) else: break grad = grad / torch.clamp(torch.sqrt( torch.mean(torch.mul(grad, grad))), min=1e-12) def print_loss(model, direction): length = [1e-4, 1e-3] les = [] for ss in length: logits_p = model(adv_images + ss * direction) loss_p = self.xent_loss(logits_p, true_labels, target_labels) les.append((loss_p - l)[0].item()) log.info("losses: ".format(les)) if args.show_loss: if args.method == 'biased' or args.method == 'fixed_biased': show_input = adv_images + lr * prior logits_show = self.model(show_input) lprior = self.xent_loss(logits_show, true_labels, target_labels) - l print_loss(self.model, prior) show_input_2 = adv_images + lr * grad logits_show2 = self.model(show_input_2) lgrad = self.xent_loss(logits_show2, true_labels, target_labels) - l print_loss(self.model, grad) log.info(lprior, lgrad) else: grad = prior # log.info("angle = {:.4f}".format(torch.sum(true*grad) / # torch.clamp(torch.sqrt(torch.sum(true*true) * torch.sum(grad*grad)),min=1e-12))) if args.norm == "l2": adv_images = adv_images + lr * grad / torch.clamp( torch.sqrt(torch.mean(torch.mul(grad, grad))), min=1e-12) adv_images = self.l2_proj_step(images, eps, adv_images) else: if grad.dim() == 3: grad = grad.unsqueeze(0) adv_images = adv_images + lr * torch.sign(grad) adv_images = torch.min(torch.max(adv_images, images - eps), images + eps) adv_images = torch.clamp(adv_images, self.clip_min, self.clip_max) adv_labels = self.get_pred(self.model, adv_images) logits_ = self.model(adv_images) l = self.xent_loss(logits_, true_labels, target_labels) # log.info('queries:', total_q, 'loss:', l, 'learning rate:', lr, 'sigma:', sigma, 'prediction:', adv_labels, # 'distortion:', torch.max(torch.abs(adv_images - images)).item(), torch.norm((adv_images - 
images).view(images.size(0),-1)).item()) ite += 1 if (self.targeted and adv_labels[0].item() == target_labels[0].item()) \ or (not self.targeted and adv_labels[0].item() != true_labels[0].item()): log.info( "Success in {}-th image, Stop at queries : {}".format( batch_idx, total_q)) success += 1 not_done.append(0) queries.append(total_q) break else: not_done.append(1) queries.append( args.max_queries) # 因此不能用np.mean(queries)来计算,平均query次数 log.info( 'Attack {} success rate: {:.3f} Queries_mean: {:.3f} Queries_median: {:.3f}' .format(arch, success / total, np.mean(queries), np.median(queries))) correct_all = np.concatenate(correct_all, axis=0).astype(np.int32) query_all = np.array(queries).astype(np.int32) not_done_all = np.array(not_done).astype(np.int32) success = (1 - not_done_all) * correct_all success_query = success * query_all meta_info_dict = { "query_all": query_all.tolist(), "not_done_all": not_done_all.tolist(), "correct_all": correct_all.tolist(), "mean_query": np.mean(success_query[np.nonzero(success)[0]]).item(), "max_query": np.max(success_query[np.nonzero(success)[0]]).item(), "median_query": np.median(success_query[np.nonzero(success)[0]]).item(), "avg_not_done": np.mean(not_done_all.astype(np.float32)).item(), "args": vars(args) } with open(result_dump_path, "w") as result_file_obj: json.dump(meta_info_dict, result_file_obj, sort_keys=True) log.info("done, write stats info to {}".format(result_dump_path))
def l1norm(X, dim, eps=1e-8):
    """L1-normalize columns of X
    """
    norm = torch.abs(X).sum(dim=dim, keepdim=True) + eps
    X = torch.div(X, norm)
    return X
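An illustrative call to `l1norm` (assumes `torch` is imported and the definition above is in scope); each slice along `dim` should sum to 1 in absolute value afterwards:

import torch

X = torch.randn(4, 6)
Xn = l1norm(X, dim=-1)
print(torch.abs(Xn).sum(dim=-1))  # each row sums to ~1.0 in absolute value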
def conditional_lognormal_loss(model, x, t, e, pdf_u, pdf_c, hr_loss=False, imbalance_loss=False, elbo=True, risk=1): shape, scale, logits = model.forward(x) lossf = [] losss = [] k_ = shape b_ = scale loss_neg = 0 for g in range(model.k): mu = k_[:, g] sigma = b_[:, g] f = -sigma - 0.5 * np.log(2 * np.pi) f = f - torch.div((torch.log(t) - mu)**2, 2. * torch.exp(2 * sigma)) s = torch.div(torch.log(t) - mu, torch.exp(sigma) * np.sqrt(2)) s = 0.5 - 0.5 * torch.erf(s) s = torch.log(s) lossf.append(f) losss.append(s) # negative partial log likelihood hr = f - s loss_neg += PartialLogLikelihood()(hr, e) losss = torch.stack(losss, dim=1) lossf = torch.stack(lossf, dim=1) if elbo: lossg = nn.Softmax(dim=1)(logits) losss = lossg * losss lossf = lossg * lossf losss = losss.sum(dim=1) lossf = lossf.sum(dim=1) else: lossg = nn.LogSoftmax(dim=1)(logits) losss = lossg + losss lossf = lossg + lossf losss = torch.logsumexp(losss, dim=1) lossf = torch.logsumexp(lossf, dim=1) if imbalance_loss: try: idx_time = t.int().cpu().detach().numpy() idx_time[idx_time >= 10] = 9 pdf_u_ = torch.tensor(pdf_u).cuda() pdf_c_ = torch.tensor(pdf_c).cuda() lossf = lossf * ((1 - pdf_u_[idx_time]).exp()) losss = losss * ((1 - pdf_c_[idx_time]).exp()) except: pass uncens = np.where(e.cpu().data.numpy() == int(risk))[0] cens = np.where(e.cpu().data.numpy() != int(risk))[0] ll = lossf[uncens].sum() + model.discount * losss[cens].sum() if hr_loss and e.sum() > 0: return -ll / float(len(uncens) + len(cens)) + loss_neg * model.gamma else: return -ll / float(len(uncens) + len(cens))
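For reference, the per-component terms built in the loop above appear to correspond to the normal log-likelihood of log t and the matching log-survival, with the scale parameter stored on the log scale (sigma = log of the usual lognormal sigma); they differ from the lognormal density of t only by the -log t Jacobian term, which is constant in the parameters:

\log f(t \mid \mu, \sigma) = -\sigma - \tfrac{1}{2}\log(2\pi) - \frac{(\log t - \mu)^2}{2\, e^{2\sigma}},
\qquad
\log S(t \mid \mu, \sigma) = \log\!\left(\tfrac{1}{2} - \tfrac{1}{2}\,\operatorname{erf}\!\left(\frac{\log t - \mu}{\sqrt{2}\, e^{\sigma}}\right)\right).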